diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..c317bd563e346bd301f844dc3854a18122f56bec 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+hf_model/tokenizer.json filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c5356b25f4729321651d039cb96c8e0d682c4a8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,148 @@
+---
+license: gemma
+library_name: coreml
+base_model: google/gemma-4-E4B-it
+tags:
+  - coreml
+  - apple-silicon
+  - ane
+  - on-device
+  - gemma-4
+  - multimodal
+  - vision
+  - audio
+pipeline_tag: image-text-to-text
+---
+
+## Use it from Swift
+
+<!-- swift-usage-begin -->
+### Add the package
+
+`Package.swift`:
+
+```swift
+.package(url: "https://github.com/john-rocky/CoreML-LLM", branch: "main"),
+
+// In your target:
+.product(name: "CoreMLLLM", package: "CoreML-LLM"),
+```
+
+Platforms: iOS 18+ / macOS 15+.
+
+### Download + chat (one call, text + image + audio)
+
+```swift
+import CoreMLLLM
+
+// First call pulls the bundle from this repo to Documents/Models/.
+let llm = try await CoreMLLLM.load(repo: "mlboydaisuke/gemma-4-E4B-multimodal-coreml")
+
+// Text-only
+let stream = try await llm.generate(
+    [CoreMLLLM.Message(role: .user, content: "Hello!")],
+    maxTokens: 256
+)
+for await chunk in stream { print(chunk, terminator: "") }
+
+// Image + text
+let image: CGImage = // ... your image
+let stream2 = try await llm.generate(
+    [CoreMLLLM.Message(role: .user, content: "Describe this picture.")],
+    image: image, maxTokens: 256)
+
+// Audio + text (16 kHz mono PCM Float)
+let pcm: [Float] = // ... your audio samples
+let stream3 = try await llm.generate(
+    [CoreMLLLM.Message(role: .user, content: "What language is this?")],
+    audio: pcm, maxTokens: 256)
+```
+
+Set the environment variable `LLM_VISION_FORCE_ANE=1` in your Xcode scheme to route the vision encoder through the Apple Neural Engine. The bundled vision encoder is built ANE-targeted and emits 256 tokens per image at the LM hidden dimension (2560).
+<!-- swift-usage-end -->
+
+# Gemma 4 E4B (multimodal) — Core ML (INT4, Apple Neural Engine)
+
+Core ML port of [`google/gemma-4-E4B-it`](https://huggingface.co/google/gemma-4-E4B-it) with vision (still image), video, and audio (Conformer) encoders. The language model is split into sliding-window-attention chunks that target the Apple Neural Engine; the vision encoder is also ANE-targeted, while the audio encoder runs on the GPU with a small Swift/Accelerate projection sidecar.
+
+**iPhone 17 Pro validated 2026-05-03** — text decode **15.7 tok/s** with correct outputs across all four input modalities (text / image / video / audio).
+
+Built from [`john-rocky/CoreML-LLM`](https://github.com/john-rocky/CoreML-LLM); see [`docs/E4B_MULTIMODAL_BUILD.md`](https://github.com/john-rocky/CoreML-LLM/blob/main/docs/E4B_MULTIMODAL_BUILD.md) for the full reproduction guide and [`scripts/assemble_gemma4_e4b_multimodal.sh`](https://github.com/john-rocky/CoreML-LLM/blob/main/scripts/assemble_gemma4_e4b_multimodal.sh) for the assembly script.
+
+## Files
+
+```
+# Decode chunks (3-chunk Topology II — auto-detected by ChunkedEngine)
+chunk1.mlmodelc/                # L0-11   — own KV
+chunk2_3way.mlmodelc/           # L12-32  — merged 21 layers (own + KV-shared internal)
+chunk3_3way.mlmodelc/           # L33-41 + lm_head + argmax
+
+# Prefill chunks (legacy 4-chunk layout with prefill_b8 multifunction inside; chunk1 above is shared with decode)
+chunk2.mlmodelc/                # L12-22  prefill (own KV writes via recurrent shift)
+chunk3.mlmodelc/                # L23-32  prefill (KV-shared)
+chunk4.mlmodelc/                # L33-41  prefill + lm_head
+
+# Vision encoder (ANE-targeted)
+vision.ane.mlmodelc/            # SigLIP, output [1, 256, 2560]
+
+# Audio encoder + Swift projection sidecars
+audio.mlmodelc/                 # Conformer, output [1, 50, 1024]
+audio_config.json
+mel_filterbank.bin
+output_proj_weight.npy          # 1024 -> 1536 (audio_soft_token_size)
+output_proj_bias.npy
+embed_proj_weight.npy           # 1536 -> 2560 (LM hidden) — E4B-specific shape
+
+# Token / per-layer embeddings (mmap'd, dequantised on demand by Swift)
+embed_tokens_q8.bin             640 MB  — INT8 token embeddings (262144 x 2560)
+embed_tokens_scales.bin         512 KB
+embed_tokens_per_layer_q8.bin   2.6 GB  — INT8 per-layer embeddings (PLE)
+embed_tokens_per_layer_scales.bin 512 KB
+per_layer_projection.bin        53 MB
+per_layer_norm_weight.bin       512 B
+
+# RoPE cos/sin tables (pre-baked, mmap'd)
+cos_sliding.npy / sin_sliding.npy
+cos_full.npy    / sin_full.npy
+
+# Tokenizer + runtime config
+hf_model/
+  tokenizer.json, tokenizer_config.json, config.json, generation_config.json
+model_config.json
+```
+
+Total bundle size: **~7.6 GB**.
+
+## Engine path on iPhone (what runs where)
+
+| Stage | Compute | Files used |
+|---|---|---|
+| Token / PLE embed lookup | Swift CPU (mmap) | `embed_tokens*.bin`, `per_layer_*.bin` |
+| Decode (T=1) | ANE | `chunk1` + `chunk2_3way` + `chunk3_3way` |
+| Prefill (batched, T=8) | ANE | `chunk1` + `chunk2` + `chunk3` + `chunk4` (`prefill_b8` multifunction) |
+| Vision encoder | ANE | `vision.ane.mlmodelc` (with `LLM_VISION_FORCE_ANE=1`) |
+| Audio encoder | GPU | `audio.mlmodelc` |
+| Audio projection (1024 → 1536 → 2560) | Swift / Accelerate | `output_proj_*.npy`, `embed_proj_weight.npy` |
+
+The Swift runtime auto-detects Topology II by the presence of `chunk2_3way` + `chunk3_3way` and routes prefill through the legacy 4-chunk `prefill_b8` multifunction (the engine's `fillBatchMasksVisionAware` keeps bidirectional within-image attention working at `T=8` batches).
+
+## Why so many sidecars (vs a single `model.mlpackage`)?
+
+Gemma 4 E-series uses a per-layer embedding (PLE) bank that's much larger than the token embedding (2.6 GB vs 640 MB for E4B). Loading PLE through Core ML would dequantize the entire bank into the CPU heap and blow up `phys_footprint`. We mmap the raw INT8 + scale `.bin` files instead, dequantize the few rows touched per token in pure Swift, and feed the result to the chunks. The chunks themselves are pure transformer bodies and stay ANE-resident.
+
+The `.npy` RoPE tables are pre-baked at conversion time so the Swift runtime doesn't need to ship a `cos`/`sin` builder.
+
+The audio projection (`output_proj_*` / `embed_proj_weight`) runs in Swift, outside the Core ML audio model, because of a Core ML GPU runtime bug with `RMSNorm(with_scale=False)` that produces all-zero outputs. SGEMM in Accelerate is fast enough on the CPU.
+
+## Tokenizer
+
+The Gemma 4 SentencePiece tokenizer ships in `hf_model/`. The three multimodal placeholder token IDs are:
+- `<|image|>` = 258880 — image-pad span (256 per still image)
+- `<|audio|>` = 258881 — audio-pad span (~188 per 2 sec)
+- `<|video|>` = 258884 — video-pad span (64 per frame)
+
+Vision encoder output rows replace `<|image|>`/`<|video|>` rows during prefill (and per-token at decode for tail spans). Audio output rows replace `<|audio|>`. `per_layer_raw` is forced to zero at multimodal positions — the chunks compute `per_layer_combined` entirely from the spliced hidden state.
+
+## License
+
+This is a derivative work of `google/gemma-4-E4B-it`. Use is governed by the [Gemma Terms of Use](https://ai.google.dev/gemma/terms). Vision / audio extensions inherit the same license.
diff --git a/audio.mlmodelc/analytics/coremldata.bin b/audio.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..59e1d0c6e55befa826e7bcd344f394de14edba8c
--- /dev/null
+++ b/audio.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea515c53f416101ef42bce8f1a9ac1be59d838914747ecba22b70ead41039ee5
+size 243
diff --git a/audio.mlmodelc/coremldata.bin b/audio.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..16fe0ef528b7abe2e3186429da3258897a48e0b9
--- /dev/null
+++ b/audio.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cda961a2bf20e093c8fe82e55240512265ee10f1cf48078033ebc972298750b9
+size 390
diff --git a/audio.mlmodelc/metadata.json b/audio.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bcd568fff1880133868c49c51a8ca62ce9d8ab6
--- /dev/null
+++ b/audio.mlmodelc/metadata.json
@@ -0,0 +1,84 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Int32, Palettized (10 bits), Palettized (11 bits), Palettized (4 bits), Palettized (9 bits), UInt4)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 50 × 1024)",
+        "shortDescription" : "",
+        "shape" : "[1, 50, 1024]",
+        "name" : "hidden_states",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 1,
+      "Ios18.mul" : 312,
+      "Ios18.softmax" : 12,
+      "Ios18.matmul" : 36,
+      "Ios16.reduceMean" : 108,
+      "Ios18.sigmoid" : 12,
+      "Split" : 12,
+      "Select" : 12,
+      "Ios18.add" : 168,
+      "Ios18.layerNorm" : 2,
+      "Ios18.reshape" : 109,
+      "Pad" : 60,
+      "Ios18.constexprLutToDense" : 134,
+      "Ios18.linear" : 121,
+      "Ios18.conv" : 14,
+      "Ios18.relu" : 2,
+      "Ios18.clip" : 312,
+      "Ios18.silu" : 36,
+      "Stack" : 24,
+      "Ios18.pow" : 216,
+      "Ios18.cast" : 540,
+      "Ios18.transpose" : 75,
+      "Ios18.tanh" : 12,
+      "Ios18.sliceByIndex" : 144
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-04-30",
+      "com.github.apple.coremltools.source" : "torch==2.11.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 200 × 128)",
+        "shortDescription" : "",
+        "shape" : "[1, 200, 128]",
+        "name" : "input_features",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "audio",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/audio.mlmodelc/model.mil b/audio.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ef7711704ba8b9d39d75076addabb513e08a6
--- /dev/null
+++ b/audio.mlmodelc/model.mil
@@ -0,0 +1,5325 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<fp16, [1, 200, 128]> input_features) {
+            tensor<int32, [1]> hidden_states_1_axes_0 = const()[name = string("hidden_states_1_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 1, 200, 128]> hidden_states_1_cast_fp16 = expand_dims(axes = hidden_states_1_axes_0, x = input_features)[name = string("hidden_states_1_cast_fp16")];
+            string hidden_states_3_pad_type_0 = const()[name = string("hidden_states_3_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> hidden_states_3_pad_0 = const()[name = string("hidden_states_3_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<int32, [2]> hidden_states_3_strides_0 = const()[name = string("hidden_states_3_strides_0"), val = tensor<int32, [2]>([2, 2])];
+            tensor<int32, [2]> hidden_states_3_dilations_0 = const()[name = string("hidden_states_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_3_groups_0 = const()[name = string("hidden_states_3_groups_0"), val = int32(1)];
+            tensor<fp16, [128, 1, 3, 3]> subsample_conv_projection_layer0_conv_weight_to_fp16 = const()[name = string("subsample_conv_projection_layer0_conv_weight_to_fp16"), val = tensor<fp16, [128, 1, 3, 3]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            tensor<fp16, [1, 128, 100, 64]> hidden_states_3_cast_fp16 = conv(dilations = hidden_states_3_dilations_0, groups = hidden_states_3_groups_0, pad = hidden_states_3_pad_0, pad_type = hidden_states_3_pad_type_0, strides = hidden_states_3_strides_0, weight = subsample_conv_projection_layer0_conv_weight_to_fp16, x = hidden_states_1_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<int32, [4]> var_198 = const()[name = string("op_198"), val = tensor<int32, [4]>([0, 2, 3, 1])];
+            tensor<int32, [1]> var_202_axes_0 = const()[name = string("op_202_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [128]> subsample_conv_projection_layer0_norm_weight_to_fp16 = const()[name = string("subsample_conv_projection_layer0_norm_weight_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2432)))];
+            fp16 var_182_to_fp16 = const()[name = string("op_182_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 100, 64, 128]> input_3_cast_fp16 = transpose(perm = var_198, x = hidden_states_3_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [1, 100, 64, 128]> var_202_cast_fp16 = layer_norm(axes = var_202_axes_0, epsilon = var_182_to_fp16, gamma = subsample_conv_projection_layer0_norm_weight_to_fp16, x = input_3_cast_fp16)[name = string("op_202_cast_fp16")];
+            tensor<int32, [4]> var_203 = const()[name = string("op_203"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<fp16, [1, 128, 100, 64]> var_204_cast_fp16 = transpose(perm = var_203, x = var_202_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [1, 128, 100, 64]> hidden_states_5_cast_fp16 = relu(x = var_204_cast_fp16)[name = string("hidden_states_5_cast_fp16")];
+            string hidden_states_7_pad_type_0 = const()[name = string("hidden_states_7_pad_type_0"), val = string("custom")];
+            tensor<int32, [4]> hidden_states_7_pad_0 = const()[name = string("hidden_states_7_pad_0"), val = tensor<int32, [4]>([1, 1, 1, 1])];
+            tensor<int32, [2]> hidden_states_7_strides_0 = const()[name = string("hidden_states_7_strides_0"), val = tensor<int32, [2]>([2, 2])];
+            tensor<int32, [2]> hidden_states_7_dilations_0 = const()[name = string("hidden_states_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 hidden_states_7_groups_0 = const()[name = string("hidden_states_7_groups_0"), val = int32(1)];
+            tensor<fp16, [32, 128, 3, 3]> subsample_conv_projection_layer1_conv_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [32, 128, 3, 3]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2752))), lut = tensor<fp16, [1, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21248))))[name = string("subsample_conv_projection_layer1_conv_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 32, 50, 32]> hidden_states_7_cast_fp16 = conv(dilations = hidden_states_7_dilations_0, groups = hidden_states_7_groups_0, pad = hidden_states_7_pad_0, pad_type = hidden_states_7_pad_type_0, strides = hidden_states_7_strides_0, weight = subsample_conv_projection_layer1_conv_weight_to_fp16_palettized, x = hidden_states_5_cast_fp16)[name = string("hidden_states_7_cast_fp16")];
+            tensor<int32, [4]> var_216 = const()[name = string("op_216"), val = tensor<int32, [4]>([0, 2, 3, 1])];
+            tensor<int32, [1]> var_220_axes_0 = const()[name = string("op_220_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [32]> subsample_conv_projection_layer1_norm_weight_to_fp16 = const()[name = string("subsample_conv_projection_layer1_norm_weight_to_fp16"), val = tensor<fp16, [32]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21376)))];
+            tensor<fp16, [1, 50, 32, 32]> input_9_cast_fp16 = transpose(perm = var_216, x = hidden_states_7_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 50, 32, 32]> var_220_cast_fp16 = layer_norm(axes = var_220_axes_0, epsilon = var_182_to_fp16, gamma = subsample_conv_projection_layer1_norm_weight_to_fp16, x = input_9_cast_fp16)[name = string("op_220_cast_fp16")];
+            tensor<fp16, [1, 50, 32, 32]> hidden_states_9_cast_fp16 = relu(x = var_220_cast_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<int32, [3]> var_230 = const()[name = string("op_230"), val = tensor<int32, [3]>([1, 50, -1])];
+            tensor<fp16, [1, 50, 1024]> input_13_cast_fp16 = reshape(shape = var_230, x = hidden_states_9_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<fp16, [1024, 1024]> subsample_conv_projection_input_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(21504))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545856))))[name = string("subsample_conv_projection_input_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1024]> linear_0_bias_0_to_fp16 = const()[name = string("linear_0_bias_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546944)))];
+            tensor<fp16, [1, 50, 1024]> linear_0_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = subsample_conv_projection_input_proj_linear_weight_to_fp16_palettized, x = input_13_cast_fp16)[name = string("linear_0_cast_fp16")];
+            string linear_0_cast_fp16_to_fp32_dtype_0 = const()[name = string("linear_0_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_242 = const()[name = string("op_242"), val = fp32(-0x1p-1)];
+            fp32 var_243 = const()[name = string("op_243"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_244 = const()[name = string("op_244"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> linear_0_cast_fp16_to_fp32 = cast(dtype = linear_0_cast_fp16_to_fp32_dtype_0, x = linear_0_cast_fp16)[name = string("cast_539")];
+            tensor<fp32, [1, 50, 1024]> clip_0 = clip(alpha = var_244, beta = var_243, x = linear_0_cast_fp16_to_fp32)[name = string("clip_0")];
+            fp32 var_238_promoted = const()[name = string("op_238_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_252 = pow(x = clip_0, y = var_238_promoted)[name = string("op_252")];
+            tensor<int32, [1]> var_254_axes_0 = const()[name = string("op_254_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_254_keep_dims_0 = const()[name = string("op_254_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_254 = reduce_mean(axes = var_254_axes_0, keep_dims = var_254_keep_dims_0, x = var_252)[name = string("op_254")];
+            string var_254_to_fp16_dtype_0 = const()[name = string("op_254_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_255_to_fp16 = const()[name = string("op_255_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_254_to_fp16 = cast(dtype = var_254_to_fp16_dtype_0, x = var_254)[name = string("cast_538")];
+            tensor<fp16, [1, 50, 1]> mean_squared_1_cast_fp16 = add(x = var_254_to_fp16, y = var_255_to_fp16)[name = string("mean_squared_1_cast_fp16")];
+            string mean_squared_1_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_1_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_1_cast_fp16_to_fp32 = cast(dtype = mean_squared_1_cast_fp16_to_fp32_dtype_0, x = mean_squared_1_cast_fp16)[name = string("cast_537")];
+            tensor<fp32, [1, 50, 1]> var_257 = pow(x = mean_squared_1_cast_fp16_to_fp32, y = var_242)[name = string("op_257")];
+            string clip_0_to_fp16_dtype_0 = const()[name = string("clip_0_to_fp16_dtype_0"), val = string("fp16")];
+            string var_257_to_fp16_dtype_0 = const()[name = string("op_257_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_0_to_fp16 = cast(dtype = clip_0_to_fp16_dtype_0, x = clip_0)[name = string("cast_535")];
+            tensor<fp16, [1, 50, 1]> var_257_to_fp16 = cast(dtype = var_257_to_fp16_dtype_0, x = var_257)[name = string("cast_536")];
+            tensor<fp16, [1, 50, 1024]> normed_output_1_cast_fp16 = mul(x = clip_0_to_fp16, y = var_257_to_fp16)[name = string("normed_output_1_cast_fp16")];
+            tensor<fp16, [1024]> const_2_to_fp16 = const()[name = string("const_2_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(549056)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_3_cast_fp16 = mul(x = normed_output_1_cast_fp16, y = const_2_to_fp16)[name = string("normed_output_3_cast_fp16")];
+            fp16 feed_forward1s_0_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.9cp+3)];
+            fp16 feed_forward1s_0_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.9ap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_1_cast_fp16 = clip(alpha = feed_forward1s_0_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_0_ffw_layer_1_input_max_to_fp16, x = normed_output_3_cast_fp16)[name = string("clip_1_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_0_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551168))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2648384))))[name = string("feed_forward1s_0_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [4096]> linear_1_bias_0_to_fp16 = const()[name = string("linear_1_bias_0_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2652544)))];
+            tensor<fp16, [1, 50, 4096]> linear_1_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_0_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_1_cast_fp16)[name = string("linear_1_cast_fp16")];
+            fp16 feed_forward1s_0_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.4p+5)];
+            fp16 feed_forward1s_0_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.3ep+5)];
+            tensor<fp16, [1, 50, 4096]> clip_2_cast_fp16 = clip(alpha = feed_forward1s_0_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_0_ffw_layer_1_output_max_to_fp16, x = linear_1_cast_fp16)[name = string("clip_2_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_21_cast_fp16 = silu(x = clip_2_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            fp16 feed_forward1s_0_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.66p+3)];
+            fp16 feed_forward1s_0_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.62p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_3_cast_fp16 = clip(alpha = feed_forward1s_0_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_0_ffw_layer_2_input_max_to_fp16, x = hidden_states_21_cast_fp16)[name = string("clip_3_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_0_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2660800))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4758016))))[name = string("feed_forward1s_0_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_2_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_0_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_3_cast_fp16)[name = string("linear_2_cast_fp16")];
+            fp16 feed_forward1s_0_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.12p+5)];
+            fp16 feed_forward1s_0_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_0_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.1p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_4_cast_fp16 = clip(alpha = feed_forward1s_0_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_0_ffw_layer_2_output_max_to_fp16, x = linear_2_cast_fp16)[name = string("clip_4_cast_fp16")];
+            string clip_4_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_4_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_4_cast_fp16_to_fp32 = cast(dtype = clip_4_cast_fp16_to_fp32_dtype_0, x = clip_4_cast_fp16)[name = string("cast_534")];
+            tensor<fp32, [1, 50, 1024]> clip_5 = clip(alpha = var_244, beta = var_243, x = clip_4_cast_fp16_to_fp32)[name = string("clip_5")];
+            fp32 var_238_promoted_1 = const()[name = string("op_238_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_284 = pow(x = clip_5, y = var_238_promoted_1)[name = string("op_284")];
+            tensor<int32, [1]> var_286_axes_0 = const()[name = string("op_286_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_286_keep_dims_0 = const()[name = string("op_286_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_286 = reduce_mean(axes = var_286_axes_0, keep_dims = var_286_keep_dims_0, x = var_284)[name = string("op_286")];
+            string var_286_to_fp16_dtype_0 = const()[name = string("op_286_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_287_to_fp16 = const()[name = string("op_287_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_286_to_fp16 = cast(dtype = var_286_to_fp16_dtype_0, x = var_286)[name = string("cast_533")];
+            tensor<fp16, [1, 50, 1]> mean_squared_3_cast_fp16 = add(x = var_286_to_fp16, y = var_287_to_fp16)[name = string("mean_squared_3_cast_fp16")];
+            string mean_squared_3_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_3_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_3_cast_fp16_to_fp32 = cast(dtype = mean_squared_3_cast_fp16_to_fp32_dtype_0, x = mean_squared_3_cast_fp16)[name = string("cast_532")];
+            tensor<fp32, [1, 50, 1]> var_289 = pow(x = mean_squared_3_cast_fp16_to_fp32, y = var_242)[name = string("op_289")];
+            string clip_5_to_fp16_dtype_0 = const()[name = string("clip_5_to_fp16_dtype_0"), val = string("fp16")];
+            string var_289_to_fp16_dtype_0 = const()[name = string("op_289_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_5_to_fp16 = cast(dtype = clip_5_to_fp16_dtype_0, x = clip_5)[name = string("cast_530")];
+            tensor<fp16, [1, 50, 1]> var_289_to_fp16 = cast(dtype = var_289_to_fp16_dtype_0, x = var_289)[name = string("cast_531")];
+            tensor<fp16, [1, 50, 1024]> normed_output_5_cast_fp16 = mul(x = clip_5_to_fp16, y = var_289_to_fp16)[name = string("normed_output_5_cast_fp16")];
+            tensor<fp16, [1024]> const_3_to_fp16 = const()[name = string("const_3_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4759104)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_7_cast_fp16 = mul(x = normed_output_5_cast_fp16, y = const_3_to_fp16)[name = string("normed_output_7_cast_fp16")];
+            fp16 var_234_to_fp16 = const()[name = string("op_234_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_33_cast_fp16 = mul(x = normed_output_7_cast_fp16, y = var_234_to_fp16)[name = string("hidden_states_33_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_35_cast_fp16 = add(x = hidden_states_33_cast_fp16, y = linear_0_cast_fp16)[name = string("hidden_states_35_cast_fp16")];
+            fp16 var_296_to_fp16 = const()[name = string("op_296_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_297_to_fp16 = const()[name = string("op_297_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_6_cast_fp16 = clip(alpha = var_296_to_fp16, beta = var_297_to_fp16, x = hidden_states_35_cast_fp16)[name = string("clip_6_cast_fp16")];
+            string clip_6_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_6_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_299 = const()[name = string("op_299"), val = fp32(-0x1p-1)];
+            fp32 var_303_promoted = const()[name = string("op_303_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_6_cast_fp16_to_fp32 = cast(dtype = clip_6_cast_fp16_to_fp32_dtype_0, x = clip_6_cast_fp16)[name = string("cast_529")];
+            tensor<fp32, [1, 50, 1024]> var_309 = pow(x = clip_6_cast_fp16_to_fp32, y = var_303_promoted)[name = string("op_309")];
+            tensor<int32, [1]> var_311_axes_0 = const()[name = string("op_311_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_311_keep_dims_0 = const()[name = string("op_311_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_311 = reduce_mean(axes = var_311_axes_0, keep_dims = var_311_keep_dims_0, x = var_309)[name = string("op_311")];
+            string var_311_to_fp16_dtype_0 = const()[name = string("op_311_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_312_to_fp16 = const()[name = string("op_312_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_311_to_fp16 = cast(dtype = var_311_to_fp16_dtype_0, x = var_311)[name = string("cast_528")];
+            tensor<fp16, [1, 50, 1]> mean_squared_5_cast_fp16 = add(x = var_311_to_fp16, y = var_312_to_fp16)[name = string("mean_squared_5_cast_fp16")];
+            string mean_squared_5_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_5_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_5_cast_fp16_to_fp32 = cast(dtype = mean_squared_5_cast_fp16_to_fp32_dtype_0, x = mean_squared_5_cast_fp16)[name = string("cast_527")];
+            tensor<fp32, [1, 50, 1]> var_314 = pow(x = mean_squared_5_cast_fp16_to_fp32, y = var_299)[name = string("op_314")];
+            string var_314_to_fp16_dtype_0 = const()[name = string("op_314_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_314_to_fp16 = cast(dtype = var_314_to_fp16_dtype_0, x = var_314)[name = string("cast_526")];
+            tensor<fp16, [1, 50, 1024]> normed_output_9_cast_fp16 = mul(x = clip_6_cast_fp16, y = var_314_to_fp16)[name = string("normed_output_9_cast_fp16")];
+            tensor<fp16, [1024]> const_4_to_fp16 = const()[name = string("const_4_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4761216)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_11_cast_fp16 = mul(x = normed_output_9_cast_fp16, y = const_4_to_fp16)[name = string("normed_output_11_cast_fp16")];
+            int32 var_320 = const()[name = string("op_320"), val = int32(-1)];
+            fp32 var_321 = const()[name = string("op_321"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_0_q_proj_input_min_to_fp16 = const()[name = string("self_attns_0_q_proj_input_min_to_fp16"), val = fp16(-0x1.46p+4)];
+            fp16 self_attns_0_q_proj_input_max_to_fp16 = const()[name = string("self_attns_0_q_proj_input_max_to_fp16"), val = fp16(0x1.44p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_7_cast_fp16 = clip(alpha = self_attns_0_q_proj_input_min_to_fp16, beta = self_attns_0_q_proj_input_max_to_fp16, x = normed_output_11_cast_fp16)[name = string("clip_7_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_0_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(4763328))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5287680))))[name = string("self_attns_0_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_3_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_0_q_proj_linear_weight_to_fp16_palettized, x = clip_7_cast_fp16)[name = string("linear_3_cast_fp16")];
+            fp16 self_attns_0_q_proj_output_min_to_fp16 = const()[name = string("self_attns_0_q_proj_output_min_to_fp16"), val = fp16(-0x1.14p+5)];
+            fp16 self_attns_0_q_proj_output_max_to_fp16 = const()[name = string("self_attns_0_q_proj_output_max_to_fp16"), val = fp16(0x1.12p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_8_cast_fp16 = clip(alpha = self_attns_0_q_proj_output_min_to_fp16, beta = self_attns_0_q_proj_output_max_to_fp16, x = linear_3_cast_fp16)[name = string("clip_8_cast_fp16")];
+            tensor<int32, [4]> var_365 = const()[name = string("op_365"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_1_cast_fp16 = reshape(shape = var_365, x = clip_8_cast_fp16)[name = string("q_1_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_0_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5288768))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5813120))))[name = string("self_attns_0_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_4_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_0_k_proj_linear_weight_to_fp16_palettized, x = clip_7_cast_fp16)[name = string("linear_4_cast_fp16")];
+            fp16 self_attns_0_k_proj_output_min_to_fp16 = const()[name = string("self_attns_0_k_proj_output_min_to_fp16"), val = fp16(-0x1.14p+5)];
+            fp16 self_attns_0_k_proj_output_max_to_fp16 = const()[name = string("self_attns_0_k_proj_output_max_to_fp16"), val = fp16(0x1.12p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_10_cast_fp16 = clip(alpha = self_attns_0_k_proj_output_min_to_fp16, beta = self_attns_0_k_proj_output_max_to_fp16, x = linear_4_cast_fp16)[name = string("clip_10_cast_fp16")];
+            tensor<int32, [4]> var_377 = const()[name = string("op_377"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_1_cast_fp16 = reshape(shape = var_377, x = clip_10_cast_fp16)[name = string("k_1_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_0_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(5814208))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6338560))))[name = string("self_attns_0_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_5_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_0_v_proj_linear_weight_to_fp16_palettized, x = clip_7_cast_fp16)[name = string("linear_5_cast_fp16")];
+            fp16 self_attns_0_v_proj_output_min_to_fp16 = const()[name = string("self_attns_0_v_proj_output_min_to_fp16"), val = fp16(-0x1.14p+5)];
+            fp16 self_attns_0_v_proj_output_max_to_fp16 = const()[name = string("self_attns_0_v_proj_output_max_to_fp16"), val = fp16(0x1.12p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_12_cast_fp16 = clip(alpha = self_attns_0_v_proj_output_min_to_fp16, beta = self_attns_0_v_proj_output_max_to_fp16, x = linear_5_cast_fp16)[name = string("clip_12_cast_fp16")];
+            tensor<int32, [4]> var_389 = const()[name = string("op_389"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_31_cast_fp16 = reshape(shape = var_389, x = clip_12_cast_fp16)[name = string("input_31_cast_fp16")];
+            fp16 var_391_to_fp16 = const()[name = string("op_391_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_392_cast_fp16 = mul(x = q_1_cast_fp16, y = var_391_to_fp16)[name = string("op_392_cast_fp16")];
+            tensor<fp16, [128]> var_393_to_fp16 = const()[name = string("op_393_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6339648)))];
+            tensor<fp16, [1, 50, 8, 128]> input_27_cast_fp16 = mul(x = var_392_cast_fp16, y = var_393_to_fp16)[name = string("input_27_cast_fp16")];
+            fp16 var_395_to_fp16 = const()[name = string("op_395_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_29_cast_fp16 = mul(x = k_1_cast_fp16, y = var_395_to_fp16)[name = string("input_29_cast_fp16")];
+            tensor<int32, [8]> q_padded_1_pad_0 = const()[name = string("q_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_1_mode_0 = const()[name = string("q_padded_1_mode_0"), val = string("constant")];
+            fp16 const_5_to_fp16 = const()[name = string("const_5_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_1_cast_fp16 = pad(constant_val = const_5_to_fp16, mode = q_padded_1_mode_0, pad = q_padded_1_pad_0, x = input_27_cast_fp16)[name = string("q_padded_1_cast_fp16")];
+            tensor<int32, [5]> var_399 = const()[name = string("op_399"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_1_cast_fp16 = reshape(shape = var_399, x = q_padded_1_cast_fp16)[name = string("q_blocks_1_cast_fp16")];
+            tensor<int32, [8]> k_padded_1_pad_0 = const()[name = string("k_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_1_mode_0 = const()[name = string("k_padded_1_mode_0"), val = string("constant")];
+            fp16 const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_1_cast_fp16 = pad(constant_val = const_6_to_fp16, mode = k_padded_1_mode_0, pad = k_padded_1_pad_0, x = input_29_cast_fp16)[name = string("k_padded_1_cast_fp16")];
+            tensor<int32, [8]> v_padded_1_pad_0 = const()[name = string("v_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_1_mode_0 = const()[name = string("v_padded_1_mode_0"), val = string("constant")];
+            fp16 const_7_to_fp16 = const()[name = string("const_7_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_1_cast_fp16 = pad(constant_val = const_7_to_fp16, mode = v_padded_1_mode_0, pad = v_padded_1_pad_0, x = input_31_cast_fp16)[name = string("v_padded_1_cast_fp16")];
+            tensor<int32, [4]> var_406_begin_0 = const()[name = string("op_406_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_406_end_0 = const()[name = string("op_406_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_406_end_mask_0 = const()[name = string("op_406_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_406_cast_fp16 = slice_by_index(begin = var_406_begin_0, end = var_406_end_0, end_mask = var_406_end_mask_0, x = k_padded_1_cast_fp16)[name = string("op_406_cast_fp16")];
+            tensor<int32, [4]> var_408_begin_0 = const()[name = string("op_408_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_408_end_0 = const()[name = string("op_408_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_408_end_mask_0 = const()[name = string("op_408_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_408_cast_fp16 = slice_by_index(begin = var_408_begin_0, end = var_408_end_0, end_mask = var_408_end_mask_0, x = k_padded_1_cast_fp16)[name = string("op_408_cast_fp16")];
+            tensor<int32, [4]> var_410_begin_0 = const()[name = string("op_410_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_410_end_0 = const()[name = string("op_410_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_410_end_mask_0 = const()[name = string("op_410_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_410_cast_fp16 = slice_by_index(begin = var_410_begin_0, end = var_410_end_0, end_mask = var_410_end_mask_0, x = k_padded_1_cast_fp16)[name = string("op_410_cast_fp16")];
+            tensor<int32, [4]> var_412_begin_0 = const()[name = string("op_412_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_412_end_0 = const()[name = string("op_412_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_412_end_mask_0 = const()[name = string("op_412_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_412_cast_fp16 = slice_by_index(begin = var_412_begin_0, end = var_412_end_0, end_mask = var_412_end_mask_0, x = k_padded_1_cast_fp16)[name = string("op_412_cast_fp16")];
+            tensor<int32, [4]> var_414_begin_0 = const()[name = string("op_414_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_414_end_0 = const()[name = string("op_414_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_414_end_mask_0 = const()[name = string("op_414_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_414_cast_fp16 = slice_by_index(begin = var_414_begin_0, end = var_414_end_0, end_mask = var_414_end_mask_0, x = k_padded_1_cast_fp16)[name = string("op_414_cast_fp16")];
+            int32 k_blocks_1_axis_0 = const()[name = string("k_blocks_1_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_1_cast_fp16 = stack(axis = k_blocks_1_axis_0, values = (var_406_cast_fp16, var_408_cast_fp16, var_410_cast_fp16, var_412_cast_fp16, var_414_cast_fp16))[name = string("k_blocks_1_cast_fp16")];
+            tensor<int32, [4]> var_418_begin_0 = const()[name = string("op_418_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_418_end_0 = const()[name = string("op_418_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_418_end_mask_0 = const()[name = string("op_418_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_418_cast_fp16 = slice_by_index(begin = var_418_begin_0, end = var_418_end_0, end_mask = var_418_end_mask_0, x = v_padded_1_cast_fp16)[name = string("op_418_cast_fp16")];
+            tensor<int32, [4]> var_420_begin_0 = const()[name = string("op_420_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_420_end_0 = const()[name = string("op_420_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_420_end_mask_0 = const()[name = string("op_420_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_420_cast_fp16 = slice_by_index(begin = var_420_begin_0, end = var_420_end_0, end_mask = var_420_end_mask_0, x = v_padded_1_cast_fp16)[name = string("op_420_cast_fp16")];
+            tensor<int32, [4]> var_422_begin_0 = const()[name = string("op_422_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_422_end_0 = const()[name = string("op_422_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_422_end_mask_0 = const()[name = string("op_422_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_422_cast_fp16 = slice_by_index(begin = var_422_begin_0, end = var_422_end_0, end_mask = var_422_end_mask_0, x = v_padded_1_cast_fp16)[name = string("op_422_cast_fp16")];
+            tensor<int32, [4]> var_424_begin_0 = const()[name = string("op_424_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_424_end_0 = const()[name = string("op_424_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_424_end_mask_0 = const()[name = string("op_424_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_424_cast_fp16 = slice_by_index(begin = var_424_begin_0, end = var_424_end_0, end_mask = var_424_end_mask_0, x = v_padded_1_cast_fp16)[name = string("op_424_cast_fp16")];
+            tensor<int32, [4]> var_426_begin_0 = const()[name = string("op_426_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_426_end_0 = const()[name = string("op_426_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_426_end_mask_0 = const()[name = string("op_426_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_426_cast_fp16 = slice_by_index(begin = var_426_begin_0, end = var_426_end_0, end_mask = var_426_end_mask_0, x = v_padded_1_cast_fp16)[name = string("op_426_cast_fp16")];
+            int32 v_blocks_1_axis_0 = const()[name = string("v_blocks_1_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_1_cast_fp16 = stack(axis = v_blocks_1_axis_0, values = (var_418_cast_fp16, var_420_cast_fp16, var_422_cast_fp16, var_424_cast_fp16, var_426_cast_fp16))[name = string("v_blocks_1_cast_fp16")];
+            tensor<int32, [5]> var_434 = const()[name = string("op_434"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_436 = const()[name = string("op_436"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_1_transpose_x_0 = const()[name = string("matrix_ac_1_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_1_transpose_y_0 = const()[name = string("matrix_ac_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_1_cast_fp16 = transpose(perm = var_434, x = q_blocks_1_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_1_cast_fp16 = transpose(perm = var_436, x = k_blocks_1_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_1_cast_fp16 = matmul(transpose_x = matrix_ac_1_transpose_x_0, transpose_y = matrix_ac_1_transpose_y_0, x = queries_1_cast_fp16, y = keys_t_1_cast_fp16)[name = string("matrix_ac_1_cast_fp16")];
+            tensor<int32, [4]> var_439 = const()[name = string("op_439"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_1_cast_fp16 = reshape(shape = var_439, x = queries_1_cast_fp16)[name = string("q_flat_1_cast_fp16")];
+            bool matrix_bd_1_transpose_x_0 = const()[name = string("matrix_bd_1_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_1_transpose_y_0 = const()[name = string("matrix_bd_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_1_to_fp16 = const()[name = string("rel_k_t_1_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6339968)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_1_cast_fp16 = matmul(transpose_x = matrix_bd_1_transpose_x_0, transpose_y = matrix_bd_1_transpose_y_0, x = q_flat_1_cast_fp16, y = rel_k_t_1_to_fp16)[name = string("matrix_bd_1_cast_fp16")];
+            tensor<int32, [5]> var_444 = const()[name = string("op_444"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_35_cast_fp16 = reshape(shape = var_444, x = matrix_bd_1_cast_fp16)[name = string("input_35_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_3_pad_0 = const()[name = string("matrix_bd_3_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6366656)))];
+            string matrix_bd_3_mode_0 = const()[name = string("matrix_bd_3_mode_0"), val = string("constant")];
+            fp16 const_9_to_fp16 = const()[name = string("const_9_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_3_cast_fp16 = pad(constant_val = const_9_to_fp16, mode = matrix_bd_3_mode_0, pad = matrix_bd_3_pad_0, x = input_35_cast_fp16)[name = string("matrix_bd_3_cast_fp16")];
+            tensor<int32, [4]> var_448 = const()[name = string("op_448"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_5_cast_fp16 = reshape(shape = var_448, x = matrix_bd_3_cast_fp16)[name = string("matrix_bd_5_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_7_begin_0 = const()[name = string("matrix_bd_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_7_end_0 = const()[name = string("matrix_bd_7_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_7_end_mask_0 = const()[name = string("matrix_bd_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_7_cast_fp16 = slice_by_index(begin = matrix_bd_7_begin_0, end = matrix_bd_7_end_0, end_mask = matrix_bd_7_end_mask_0, x = matrix_bd_5_cast_fp16)[name = string("matrix_bd_7_cast_fp16")];
+            tensor<int32, [5]> var_454 = const()[name = string("op_454"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_9_cast_fp16 = reshape(shape = var_454, x = matrix_bd_7_cast_fp16)[name = string("matrix_bd_9_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_1_cast_fp16 = add(x = matrix_ac_1_cast_fp16, y = matrix_bd_9_cast_fp16)[name = string("attn_1_cast_fp16")];
+            fp16 _inversed_457_y_0_to_fp16 = const()[name = string("_inversed_457_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_457_cast_fp16 = mul(x = attn_1_cast_fp16, y = _inversed_457_y_0_to_fp16)[name = string("_inversed_457_cast_fp16")];
+            string _inversed_457_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_457_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_457_cast_fp16_to_fp32 = cast(dtype = _inversed_457_cast_fp16_to_fp32_dtype_0, x = _inversed_457_cast_fp16)[name = string("cast_525")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_458 = tanh(x = _inversed_457_cast_fp16_to_fp32)[name = string("op_458")];
+            string var_458_to_fp16_dtype_0 = const()[name = string("op_458_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_0_softcap_to_fp16 = const()[name = string("self_attns_0_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_458_to_fp16 = cast(dtype = var_458_to_fp16_dtype_0, x = var_458)[name = string("cast_524")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_3_cast_fp16 = mul(x = var_458_to_fp16, y = self_attns_0_softcap_to_fp16)[name = string("attn_3_cast_fp16")];
+            string attn_3_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_3_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<bool, [1, 1, 5, 12, 24]> var_460 = const()[name = string("op_460"), val = tensor<bool, [1, 1, 5, 12, 24]>([[[[[true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false]], [[true, false, false, false, false, 
false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true], [true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true], [true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true], [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true], [true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true], [true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true], [true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true], [true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true], [true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true], [true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true], [true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false]], [[true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, 
true, true, true, true], [true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true], [true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true], [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true], [true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true], [true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true], [true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true], [true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true], [true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true], [true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true], [true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false]], [[true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true], [true, true, false, false, false, false, false, false, false, false, false, 
false, false, false, true, true, true, true, true, true, true, true, true, true], [true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true], [true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true], [true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true], [true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true], [true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true], [true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true], [true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true], [true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true], [true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, true], [true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false]], [[true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true, true], [true, true, false, false, false, false, false, false, false, false, false, false, false, false, true, true, true, true, true, true, true, true, true, true], [true, true, true, 
true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true], [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true]]]]])];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_3_cast_fp16_to_fp32 = cast(dtype = attn_3_cast_fp16_to_fp32_dtype_0, x = attn_3_cast_fp16)[name = string("cast_523")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_37 = select(a = var_321, b = attn_3_cast_fp16_to_fp32, cond = var_460)[name = string("input_37")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_462 = softmax(axis = var_320, x = input_37)[name = string("op_462")];
+            tensor<int32, [5]> var_464 = const()[name = string("op_464"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_1_transpose_x_0 = const()[name = string("out_1_transpose_x_0"), val = bool(false)];
+            bool out_1_transpose_y_0 = const()[name = string("out_1_transpose_y_0"), val = bool(false)];
+            string var_462_to_fp16_dtype_0 = const()[name = string("op_462_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_1_cast_fp16 = transpose(perm = var_464, x = v_blocks_1_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_462_to_fp16 = cast(dtype = var_462_to_fp16_dtype_0, x = var_462)[name = string("cast_522")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_1_cast_fp16 = matmul(transpose_x = out_1_transpose_x_0, transpose_y = out_1_transpose_y_0, x = var_462_to_fp16, y = values_t_1_cast_fp16)[name = string("out_1_cast_fp16")];
+            tensor<int32, [5]> var_467 = const()[name = string("op_467"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_469 = const()[name = string("op_469"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_468_cast_fp16 = transpose(perm = var_467, x = out_1_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 60, 1024]> out_3_cast_fp16 = reshape(shape = var_469, x = var_468_cast_fp16)[name = string("out_3_cast_fp16")];
+            tensor<int32, [3]> var_472_begin_0 = const()[name = string("op_472_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_472_end_0 = const()[name = string("op_472_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_472_end_mask_0 = const()[name = string("op_472_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_472_cast_fp16 = slice_by_index(begin = var_472_begin_0, end = var_472_end_0, end_mask = var_472_end_mask_0, x = out_3_cast_fp16)[name = string("op_472_cast_fp16")];
+            fp16 self_attns_0_post_input_min_to_fp16 = const()[name = string("self_attns_0_post_input_min_to_fp16"), val = fp16(-0x1.aap+4)];
+            fp16 self_attns_0_post_input_max_to_fp16 = const()[name = string("self_attns_0_post_input_max_to_fp16"), val = fp16(0x1.a8p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_13_cast_fp16 = clip(alpha = self_attns_0_post_input_min_to_fp16, beta = self_attns_0_post_input_max_to_fp16, x = var_472_cast_fp16)[name = string("clip_13_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_0_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6366784))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6891136))))[name = string("self_attns_0_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_7_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_0_post_linear_weight_to_fp16_palettized, x = clip_13_cast_fp16)[name = string("linear_7_cast_fp16")];
+            fp16 self_attns_0_post_output_min_to_fp16 = const()[name = string("self_attns_0_post_output_min_to_fp16"), val = fp16(-0x1.96p+6)];
+            fp16 self_attns_0_post_output_max_to_fp16 = const()[name = string("self_attns_0_post_output_max_to_fp16"), val = fp16(0x1.92p+6)];
+            tensor<fp16, [1, 50, 1024]> clip_14_cast_fp16 = clip(alpha = self_attns_0_post_output_min_to_fp16, beta = self_attns_0_post_output_max_to_fp16, x = linear_7_cast_fp16)[name = string("clip_14_cast_fp16")];
+            fp16 var_484_to_fp16 = const()[name = string("op_484_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_485_to_fp16 = const()[name = string("op_485_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_15_cast_fp16 = clip(alpha = var_484_to_fp16, beta = var_485_to_fp16, x = clip_14_cast_fp16)[name = string("clip_15_cast_fp16")];
+            string clip_15_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_15_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_487 = const()[name = string("op_487"), val = fp32(-0x1p-1)];
+            fp32 var_491_promoted = const()[name = string("op_491_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_15_cast_fp16_to_fp32 = cast(dtype = clip_15_cast_fp16_to_fp32_dtype_0, x = clip_15_cast_fp16)[name = string("cast_521")];
+            tensor<fp32, [1, 50, 1024]> var_497 = pow(x = clip_15_cast_fp16_to_fp32, y = var_491_promoted)[name = string("op_497")];
+            tensor<int32, [1]> var_499_axes_0 = const()[name = string("op_499_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_499_keep_dims_0 = const()[name = string("op_499_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_499 = reduce_mean(axes = var_499_axes_0, keep_dims = var_499_keep_dims_0, x = var_497)[name = string("op_499")];
+            string var_499_to_fp16_dtype_0 = const()[name = string("op_499_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_500_to_fp16 = const()[name = string("op_500_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_499_to_fp16 = cast(dtype = var_499_to_fp16_dtype_0, x = var_499)[name = string("cast_520")];
+            tensor<fp16, [1, 50, 1]> mean_squared_7_cast_fp16 = add(x = var_499_to_fp16, y = var_500_to_fp16)[name = string("mean_squared_7_cast_fp16")];
+            string mean_squared_7_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_7_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_7_cast_fp16_to_fp32 = cast(dtype = mean_squared_7_cast_fp16_to_fp32_dtype_0, x = mean_squared_7_cast_fp16)[name = string("cast_519")];
+            tensor<fp32, [1, 50, 1]> var_502 = pow(x = mean_squared_7_cast_fp16_to_fp32, y = var_487)[name = string("op_502")];
+            string var_502_to_fp16_dtype_0 = const()[name = string("op_502_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_502_to_fp16 = cast(dtype = var_502_to_fp16_dtype_0, x = var_502)[name = string("cast_518")];
+            tensor<fp16, [1, 50, 1024]> normed_output_13_cast_fp16 = mul(x = clip_15_cast_fp16, y = var_502_to_fp16)[name = string("normed_output_13_cast_fp16")];
+            tensor<fp16, [1024]> const_10_to_fp16 = const()[name = string("const_10_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6892224)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_15_cast_fp16 = mul(x = normed_output_13_cast_fp16, y = const_10_to_fp16)[name = string("normed_output_15_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_61_cast_fp16 = add(x = normed_output_15_cast_fp16, y = hidden_states_35_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            string hidden_states_61_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_61_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_509 = const()[name = string("op_509"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_510 = const()[name = string("op_510"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_522 = const()[name = string("op_522"), val = fp32(-0x1p-1)];
+            fp32 var_518_promoted = const()[name = string("op_518_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_61_cast_fp16_to_fp32 = cast(dtype = hidden_states_61_cast_fp16_to_fp32_dtype_0, x = hidden_states_61_cast_fp16)[name = string("cast_517")];
+            tensor<fp32, [1, 50, 1024]> var_530 = pow(x = hidden_states_61_cast_fp16_to_fp32, y = var_518_promoted)[name = string("op_530")];
+            tensor<int32, [1]> var_532_axes_0 = const()[name = string("op_532_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_532_keep_dims_0 = const()[name = string("op_532_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_532 = reduce_mean(axes = var_532_axes_0, keep_dims = var_532_keep_dims_0, x = var_530)[name = string("op_532")];
+            string var_532_to_fp16_dtype_0 = const()[name = string("op_532_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_533_to_fp16 = const()[name = string("op_533_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_532_to_fp16 = cast(dtype = var_532_to_fp16_dtype_0, x = var_532)[name = string("cast_516")];
+            tensor<fp16, [1, 50, 1]> mean_squared_9_cast_fp16 = add(x = var_532_to_fp16, y = var_533_to_fp16)[name = string("mean_squared_9_cast_fp16")];
+            string mean_squared_9_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_9_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_9_cast_fp16_to_fp32 = cast(dtype = mean_squared_9_cast_fp16_to_fp32_dtype_0, x = mean_squared_9_cast_fp16)[name = string("cast_515")];
+            tensor<fp32, [1, 50, 1]> var_535 = pow(x = mean_squared_9_cast_fp16_to_fp32, y = var_522)[name = string("op_535")];
+            string var_535_to_fp16_dtype_0 = const()[name = string("op_535_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_535_to_fp16 = cast(dtype = var_535_to_fp16_dtype_0, x = var_535)[name = string("cast_514")];
+            tensor<fp16, [1, 50, 1024]> normed_output_17_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = var_535_to_fp16)[name = string("normed_output_17_cast_fp16")];
+            tensor<fp16, [1024]> const_11_to_fp16 = const()[name = string("const_11_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6894336)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_19_cast_fp16 = mul(x = normed_output_17_cast_fp16, y = const_11_to_fp16)[name = string("normed_output_19_cast_fp16")];
+            fp16 lconv1ds_0_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_0_linear_start_input_min_to_fp16"), val = fp16(-0x1.04p+5)];
+            fp16 lconv1ds_0_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_0_linear_start_input_max_to_fp16"), val = fp16(0x1.02p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_16_cast_fp16 = clip(alpha = lconv1ds_0_linear_start_input_min_to_fp16, beta = lconv1ds_0_linear_start_input_max_to_fp16, x = normed_output_19_cast_fp16)[name = string("clip_16_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_0_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(6896448))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7945088))))[name = string("lconv1ds_0_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [2048]> linear_8_bias_0_to_fp16 = const()[name = string("linear_8_bias_0_to_fp16"), val = tensor<fp16, [2048]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7947200)))];
+            tensor<fp16, [1, 50, 2048]> linear_8_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_0_linear_start_linear_weight_to_fp16_palettized, x = clip_16_cast_fp16)[name = string("linear_8_cast_fp16")];
+            fp16 lconv1ds_0_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_0_linear_start_output_min_to_fp16"), val = fp16(-0x1.dap+4)];
+            fp16 lconv1ds_0_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_0_linear_start_output_max_to_fp16"), val = fp16(0x1.d6p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_17_cast_fp16 = clip(alpha = lconv1ds_0_linear_start_output_min_to_fp16, beta = lconv1ds_0_linear_start_output_max_to_fp16, x = linear_8_cast_fp16)[name = string("clip_17_cast_fp16")];
+            int32 hidden_states_69_split_num_splits_0 = const()[name = string("hidden_states_69_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_69_split_axis_0 = const()[name = string("hidden_states_69_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_69_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_69_split_cast_fp16_1 = split(axis = hidden_states_69_split_axis_0, num_splits = hidden_states_69_split_num_splits_0, x = clip_17_cast_fp16)[name = string("hidden_states_69_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_69_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_69_split_cast_fp16_1)[name = string("hidden_states_69_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_69_cast_fp16 = mul(x = hidden_states_69_split_cast_fp16_0, y = hidden_states_69_split_1_sigmoid_cast_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<int32, [3]> input_45_perm_0 = const()[name = string("input_45_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_47_pad_0 = const()[name = string("input_47_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_47_mode_0 = const()[name = string("input_47_mode_0"), val = string("constant")];
+            fp16 const_12_to_fp16 = const()[name = string("const_12_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_45_cast_fp16 = transpose(perm = input_45_perm_0, x = hidden_states_69_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 1024, 54]> input_47_cast_fp16 = pad(constant_val = const_12_to_fp16, mode = input_47_mode_0, pad = input_47_pad_0, x = input_45_cast_fp16)[name = string("input_47_cast_fp16")];
+            string var_561_pad_type_0 = const()[name = string("op_561_pad_type_0"), val = string("valid")];
+            int32 var_561_groups_0 = const()[name = string("op_561_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_561_strides_0 = const()[name = string("op_561_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_561_pad_0 = const()[name = string("op_561_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_561_dilations_0 = const()[name = string("op_561_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_0_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7951360))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7953984))))[name = string("lconv1ds_0_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_561_cast_fp16 = conv(dilations = var_561_dilations_0, groups = var_561_groups_0, pad = var_561_pad_0, pad_type = var_561_pad_type_0, strides = var_561_strides_0, weight = lconv1ds_0_depthwise_conv1d_weight_to_fp16_palettized, x = input_47_cast_fp16)[name = string("op_561_cast_fp16")];
+            tensor<int32, [3]> hidden_states_71_perm_0 = const()[name = string("hidden_states_71_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_71_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_71_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_71_cast_fp16 = transpose(perm = hidden_states_71_perm_0, x = var_561_cast_fp16)[name = string("transpose_66")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_71_cast_fp16_to_fp32 = cast(dtype = hidden_states_71_cast_fp16_to_fp32_dtype_0, x = hidden_states_71_cast_fp16)[name = string("cast_513")];
+            tensor<fp32, [1, 50, 1024]> clip_18 = clip(alpha = var_510, beta = var_509, x = hidden_states_71_cast_fp16_to_fp32)[name = string("clip_18")];
+            fp32 var_518_promoted_1 = const()[name = string("op_518_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_566 = pow(x = clip_18, y = var_518_promoted_1)[name = string("op_566")];
+            tensor<int32, [1]> var_568_axes_0 = const()[name = string("op_568_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_568_keep_dims_0 = const()[name = string("op_568_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_568 = reduce_mean(axes = var_568_axes_0, keep_dims = var_568_keep_dims_0, x = var_566)[name = string("op_568")];
+            string var_568_to_fp16_dtype_0 = const()[name = string("op_568_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_569_to_fp16 = const()[name = string("op_569_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_568_to_fp16 = cast(dtype = var_568_to_fp16_dtype_0, x = var_568)[name = string("cast_512")];
+            tensor<fp16, [1, 50, 1]> mean_squared_11_cast_fp16 = add(x = var_568_to_fp16, y = var_569_to_fp16)[name = string("mean_squared_11_cast_fp16")];
+            string mean_squared_11_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_11_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_11_cast_fp16_to_fp32 = cast(dtype = mean_squared_11_cast_fp16_to_fp32_dtype_0, x = mean_squared_11_cast_fp16)[name = string("cast_511")];
+            tensor<fp32, [1, 50, 1]> var_571 = pow(x = mean_squared_11_cast_fp16_to_fp32, y = var_522)[name = string("op_571")];
+            string clip_18_to_fp16_dtype_0 = const()[name = string("clip_18_to_fp16_dtype_0"), val = string("fp16")];
+            string var_571_to_fp16_dtype_0 = const()[name = string("op_571_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_18_to_fp16 = cast(dtype = clip_18_to_fp16_dtype_0, x = clip_18)[name = string("cast_509")];
+            tensor<fp16, [1, 50, 1]> var_571_to_fp16 = cast(dtype = var_571_to_fp16_dtype_0, x = var_571)[name = string("cast_510")];
+            tensor<fp16, [1, 50, 1024]> normed_output_21_cast_fp16 = mul(x = clip_18_to_fp16, y = var_571_to_fp16)[name = string("normed_output_21_cast_fp16")];
+            tensor<fp16, [1024]> const_13_to_fp16 = const()[name = string("const_13_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7955072)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_23_cast_fp16 = mul(x = normed_output_21_cast_fp16, y = const_13_to_fp16)[name = string("normed_output_23_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_77_cast_fp16 = silu(x = normed_output_23_cast_fp16)[name = string("hidden_states_77_cast_fp16")];
+            fp16 lconv1ds_0_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_0_linear_end_input_min_to_fp16"), val = fp16(-0x1.74p+2)];
+            fp16 lconv1ds_0_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_0_linear_end_input_max_to_fp16"), val = fp16(0x1.72p+2)];
+            tensor<fp16, [1, 50, 1024]> clip_19_cast_fp16 = clip(alpha = lconv1ds_0_linear_end_input_min_to_fp16, beta = lconv1ds_0_linear_end_input_max_to_fp16, x = hidden_states_77_cast_fp16)[name = string("clip_19_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_0_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(7957184))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8481536))))[name = string("lconv1ds_0_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_9_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_0_linear_end_linear_weight_to_fp16_palettized, x = clip_19_cast_fp16)[name = string("linear_9_cast_fp16")];
+            fp16 lconv1ds_0_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_0_linear_end_output_min_to_fp16"), val = fp16(-0x1.9p+2)];
+            fp16 lconv1ds_0_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_0_linear_end_output_max_to_fp16"), val = fp16(0x1.8cp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_20_cast_fp16 = clip(alpha = lconv1ds_0_linear_end_output_min_to_fp16, beta = lconv1ds_0_linear_end_output_max_to_fp16, x = linear_9_cast_fp16)[name = string("clip_20_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_83_cast_fp16 = add(x = clip_20_cast_fp16, y = hidden_states_61_cast_fp16)[name = string("hidden_states_83_cast_fp16")];
+            string hidden_states_83_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_83_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_595 = const()[name = string("op_595"), val = fp32(-0x1p-1)];
+            fp32 var_596 = const()[name = string("op_596"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_597 = const()[name = string("op_597"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_83_cast_fp16_to_fp32 = cast(dtype = hidden_states_83_cast_fp16_to_fp32_dtype_0, x = hidden_states_83_cast_fp16)[name = string("cast_508")];
+            tensor<fp32, [1, 50, 1024]> clip_21 = clip(alpha = var_597, beta = var_596, x = hidden_states_83_cast_fp16_to_fp32)[name = string("clip_21")];
+            fp32 var_591_promoted = const()[name = string("op_591_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_605 = pow(x = clip_21, y = var_591_promoted)[name = string("op_605")];
+            tensor<int32, [1]> var_607_axes_0 = const()[name = string("op_607_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_607_keep_dims_0 = const()[name = string("op_607_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_607 = reduce_mean(axes = var_607_axes_0, keep_dims = var_607_keep_dims_0, x = var_605)[name = string("op_607")];
+            string var_607_to_fp16_dtype_0 = const()[name = string("op_607_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_608_to_fp16 = const()[name = string("op_608_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_607_to_fp16 = cast(dtype = var_607_to_fp16_dtype_0, x = var_607)[name = string("cast_507")];
+            tensor<fp16, [1, 50, 1]> mean_squared_13_cast_fp16 = add(x = var_607_to_fp16, y = var_608_to_fp16)[name = string("mean_squared_13_cast_fp16")];
+            string mean_squared_13_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_13_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_13_cast_fp16_to_fp32 = cast(dtype = mean_squared_13_cast_fp16_to_fp32_dtype_0, x = mean_squared_13_cast_fp16)[name = string("cast_506")];
+            tensor<fp32, [1, 50, 1]> var_610 = pow(x = mean_squared_13_cast_fp16_to_fp32, y = var_595)[name = string("op_610")];
+            string clip_21_to_fp16_dtype_0 = const()[name = string("clip_21_to_fp16_dtype_0"), val = string("fp16")];
+            string var_610_to_fp16_dtype_0 = const()[name = string("op_610_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_21_to_fp16 = cast(dtype = clip_21_to_fp16_dtype_0, x = clip_21)[name = string("cast_504")];
+            tensor<fp16, [1, 50, 1]> var_610_to_fp16 = cast(dtype = var_610_to_fp16_dtype_0, x = var_610)[name = string("cast_505")];
+            tensor<fp16, [1, 50, 1024]> normed_output_25_cast_fp16 = mul(x = clip_21_to_fp16, y = var_610_to_fp16)[name = string("normed_output_25_cast_fp16")];
+            tensor<fp16, [1024]> const_14_to_fp16 = const()[name = string("const_14_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8482624)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_27_cast_fp16 = mul(x = normed_output_25_cast_fp16, y = const_14_to_fp16)[name = string("normed_output_27_cast_fp16")];
+            fp16 feed_forward2s_0_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.66p+3)];
+            fp16 feed_forward2s_0_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.62p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_22_cast_fp16 = clip(alpha = feed_forward2s_0_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_0_ffw_layer_1_input_max_to_fp16, x = normed_output_27_cast_fp16)[name = string("clip_22_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_0_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(8484736))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10581952))))[name = string("feed_forward2s_0_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_10_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_0_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_22_cast_fp16)[name = string("linear_10_cast_fp16")];
+            fp16 feed_forward2s_0_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.a6p+4)];
+            fp16 feed_forward2s_0_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.a2p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_23_cast_fp16 = clip(alpha = feed_forward2s_0_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_0_ffw_layer_1_output_max_to_fp16, x = linear_10_cast_fp16)[name = string("clip_23_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_93_cast_fp16 = silu(x = clip_23_cast_fp16)[name = string("hidden_states_93_cast_fp16")];
+            fp16 feed_forward2s_0_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.2ep+3)];
+            fp16 feed_forward2s_0_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.2cp+3)];
+            tensor<fp16, [1, 50, 4096]> clip_24_cast_fp16 = clip(alpha = feed_forward2s_0_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_0_ffw_layer_2_input_max_to_fp16, x = hidden_states_93_cast_fp16)[name = string("clip_24_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_0_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(10586112))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12683328))))[name = string("feed_forward2s_0_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_11_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_0_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_24_cast_fp16)[name = string("linear_11_cast_fp16")];
+            fp16 feed_forward2s_0_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.0ap+5)];
+            fp16 feed_forward2s_0_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_0_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.08p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_25_cast_fp16 = clip(alpha = feed_forward2s_0_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_0_ffw_layer_2_output_max_to_fp16, x = linear_11_cast_fp16)[name = string("clip_25_cast_fp16")];
+            string clip_25_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_25_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_25_cast_fp16_to_fp32 = cast(dtype = clip_25_cast_fp16_to_fp32_dtype_0, x = clip_25_cast_fp16)[name = string("cast_503")];
+            tensor<fp32, [1, 50, 1024]> clip_26 = clip(alpha = var_597, beta = var_596, x = clip_25_cast_fp16_to_fp32)[name = string("clip_26")];
+            fp32 var_591_promoted_1 = const()[name = string("op_591_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_637 = pow(x = clip_26, y = var_591_promoted_1)[name = string("op_637")];
+            tensor<int32, [1]> var_639_axes_0 = const()[name = string("op_639_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_639_keep_dims_0 = const()[name = string("op_639_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_639 = reduce_mean(axes = var_639_axes_0, keep_dims = var_639_keep_dims_0, x = var_637)[name = string("op_639")];
+            string var_639_to_fp16_dtype_0 = const()[name = string("op_639_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_640_to_fp16 = const()[name = string("op_640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_639_to_fp16 = cast(dtype = var_639_to_fp16_dtype_0, x = var_639)[name = string("cast_502")];
+            tensor<fp16, [1, 50, 1]> mean_squared_15_cast_fp16 = add(x = var_639_to_fp16, y = var_640_to_fp16)[name = string("mean_squared_15_cast_fp16")];
+            string mean_squared_15_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_15_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_15_cast_fp16_to_fp32 = cast(dtype = mean_squared_15_cast_fp16_to_fp32_dtype_0, x = mean_squared_15_cast_fp16)[name = string("cast_501")];
+            tensor<fp32, [1, 50, 1]> var_642 = pow(x = mean_squared_15_cast_fp16_to_fp32, y = var_595)[name = string("op_642")];
+            string clip_26_to_fp16_dtype_0 = const()[name = string("clip_26_to_fp16_dtype_0"), val = string("fp16")];
+            string var_642_to_fp16_dtype_0 = const()[name = string("op_642_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_26_to_fp16 = cast(dtype = clip_26_to_fp16_dtype_0, x = clip_26)[name = string("cast_499")];
+            tensor<fp16, [1, 50, 1]> var_642_to_fp16 = cast(dtype = var_642_to_fp16_dtype_0, x = var_642)[name = string("cast_500")];
+            tensor<fp16, [1, 50, 1024]> normed_output_29_cast_fp16 = mul(x = clip_26_to_fp16, y = var_642_to_fp16)[name = string("normed_output_29_cast_fp16")];
+            tensor<fp16, [1024]> const_15_to_fp16 = const()[name = string("const_15_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12684416)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_31_cast_fp16 = mul(x = normed_output_29_cast_fp16, y = const_15_to_fp16)[name = string("normed_output_31_cast_fp16")];
+            fp16 var_587_to_fp16 = const()[name = string("op_587_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_105_cast_fp16 = mul(x = normed_output_31_cast_fp16, y = var_587_to_fp16)[name = string("hidden_states_105_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_107_cast_fp16 = add(x = hidden_states_105_cast_fp16, y = hidden_states_83_cast_fp16)[name = string("hidden_states_107_cast_fp16")];
+            fp16 var_649_to_fp16 = const()[name = string("op_649_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_650_to_fp16 = const()[name = string("op_650_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_27_cast_fp16 = clip(alpha = var_649_to_fp16, beta = var_650_to_fp16, x = hidden_states_107_cast_fp16)[name = string("clip_27_cast_fp16")];
+            string clip_27_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_27_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_652 = const()[name = string("op_652"), val = fp32(-0x1p-1)];
+            fp32 var_656_promoted = const()[name = string("op_656_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_27_cast_fp16_to_fp32 = cast(dtype = clip_27_cast_fp16_to_fp32_dtype_0, x = clip_27_cast_fp16)[name = string("cast_498")];
+            tensor<fp32, [1, 50, 1024]> var_662 = pow(x = clip_27_cast_fp16_to_fp32, y = var_656_promoted)[name = string("op_662")];
+            tensor<int32, [1]> var_664_axes_0 = const()[name = string("op_664_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_664_keep_dims_0 = const()[name = string("op_664_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_664 = reduce_mean(axes = var_664_axes_0, keep_dims = var_664_keep_dims_0, x = var_662)[name = string("op_664")];
+            string var_664_to_fp16_dtype_0 = const()[name = string("op_664_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_665_to_fp16 = const()[name = string("op_665_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_664_to_fp16 = cast(dtype = var_664_to_fp16_dtype_0, x = var_664)[name = string("cast_497")];
+            tensor<fp16, [1, 50, 1]> mean_squared_17_cast_fp16 = add(x = var_664_to_fp16, y = var_665_to_fp16)[name = string("mean_squared_17_cast_fp16")];
+            string mean_squared_17_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_17_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_17_cast_fp16_to_fp32 = cast(dtype = mean_squared_17_cast_fp16_to_fp32_dtype_0, x = mean_squared_17_cast_fp16)[name = string("cast_496")];
+            tensor<fp32, [1, 50, 1]> var_667 = pow(x = mean_squared_17_cast_fp16_to_fp32, y = var_652)[name = string("op_667")];
+            string var_667_to_fp16_dtype_0 = const()[name = string("op_667_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_667_to_fp16 = cast(dtype = var_667_to_fp16_dtype_0, x = var_667)[name = string("cast_495")];
+            tensor<fp16, [1, 50, 1024]> normed_output_33_cast_fp16 = mul(x = clip_27_cast_fp16, y = var_667_to_fp16)[name = string("normed_output_33_cast_fp16")];
+            tensor<fp16, [1024]> const_16_to_fp16 = const()[name = string("const_16_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12686528)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_35_cast_fp16 = mul(x = normed_output_33_cast_fp16, y = const_16_to_fp16)[name = string("normed_output_35_cast_fp16")];
+            string normed_output_35_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_35_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_680 = const()[name = string("op_680"), val = fp32(-0x1p-1)];
+            fp32 var_681 = const()[name = string("op_681"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_682 = const()[name = string("op_682"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_35_cast_fp16_to_fp32 = cast(dtype = normed_output_35_cast_fp16_to_fp32_dtype_0, x = normed_output_35_cast_fp16)[name = string("cast_494")];
+            tensor<fp32, [1, 50, 1024]> clip_28 = clip(alpha = var_682, beta = var_681, x = normed_output_35_cast_fp16_to_fp32)[name = string("clip_28")];
+            fp32 var_676_promoted = const()[name = string("op_676_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_690 = pow(x = clip_28, y = var_676_promoted)[name = string("op_690")];
+            tensor<int32, [1]> var_692_axes_0 = const()[name = string("op_692_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_692_keep_dims_0 = const()[name = string("op_692_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_692 = reduce_mean(axes = var_692_axes_0, keep_dims = var_692_keep_dims_0, x = var_690)[name = string("op_692")];
+            string var_692_to_fp16_dtype_0 = const()[name = string("op_692_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_693_to_fp16 = const()[name = string("op_693_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_692_to_fp16 = cast(dtype = var_692_to_fp16_dtype_0, x = var_692)[name = string("cast_493")];
+            tensor<fp16, [1, 50, 1]> mean_squared_19_cast_fp16 = add(x = var_692_to_fp16, y = var_693_to_fp16)[name = string("mean_squared_19_cast_fp16")];
+            string mean_squared_19_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_19_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_19_cast_fp16_to_fp32 = cast(dtype = mean_squared_19_cast_fp16_to_fp32_dtype_0, x = mean_squared_19_cast_fp16)[name = string("cast_492")];
+            tensor<fp32, [1, 50, 1]> var_695 = pow(x = mean_squared_19_cast_fp16_to_fp32, y = var_680)[name = string("op_695")];
+            string clip_28_to_fp16_dtype_0 = const()[name = string("clip_28_to_fp16_dtype_0"), val = string("fp16")];
+            string var_695_to_fp16_dtype_0 = const()[name = string("op_695_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_28_to_fp16 = cast(dtype = clip_28_to_fp16_dtype_0, x = clip_28)[name = string("cast_490")];
+            tensor<fp16, [1, 50, 1]> var_695_to_fp16 = cast(dtype = var_695_to_fp16_dtype_0, x = var_695)[name = string("cast_491")];
+            tensor<fp16, [1, 50, 1024]> normed_output_37_cast_fp16 = mul(x = clip_28_to_fp16, y = var_695_to_fp16)[name = string("normed_output_37_cast_fp16")];
+            tensor<fp16, [1024]> const_17_to_fp16 = const()[name = string("const_17_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12688640)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_39_cast_fp16 = mul(x = normed_output_37_cast_fp16, y = const_17_to_fp16)[name = string("normed_output_39_cast_fp16")];
+            fp16 feed_forward1s_1_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.86p+3)];
+            fp16 feed_forward1s_1_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.84p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_29_cast_fp16 = clip(alpha = feed_forward1s_1_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_1_ffw_layer_1_input_max_to_fp16, x = normed_output_39_cast_fp16)[name = string("clip_29_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_1_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(12690752))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14787968))))[name = string("feed_forward1s_1_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_12_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_1_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_29_cast_fp16)[name = string("linear_12_cast_fp16")];
+            fp16 feed_forward1s_1_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.bap+4)];
+            fp16 feed_forward1s_1_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.b6p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_30_cast_fp16 = clip(alpha = feed_forward1s_1_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_1_ffw_layer_1_output_max_to_fp16, x = linear_12_cast_fp16)[name = string("clip_30_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_123_cast_fp16 = silu(x = clip_30_cast_fp16)[name = string("hidden_states_123_cast_fp16")];
+            fp16 feed_forward1s_1_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.36p+3)];
+            fp16 feed_forward1s_1_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.34p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_31_cast_fp16 = clip(alpha = feed_forward1s_1_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_1_ffw_layer_2_input_max_to_fp16, x = hidden_states_123_cast_fp16)[name = string("clip_31_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_1_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(14792128))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16889344))))[name = string("feed_forward1s_1_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_13_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_1_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_31_cast_fp16)[name = string("linear_13_cast_fp16")];
+            fp16 feed_forward1s_1_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.3ep+5)];
+            fp16 feed_forward1s_1_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_1_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.3cp+5)];
+            tensor<fp16, [1, 50, 1024]> clip_32_cast_fp16 = clip(alpha = feed_forward1s_1_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_1_ffw_layer_2_output_max_to_fp16, x = linear_13_cast_fp16)[name = string("clip_32_cast_fp16")];
+            string clip_32_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_32_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_32_cast_fp16_to_fp32 = cast(dtype = clip_32_cast_fp16_to_fp32_dtype_0, x = clip_32_cast_fp16)[name = string("cast_489")];
+            tensor<fp32, [1, 50, 1024]> clip_33 = clip(alpha = var_682, beta = var_681, x = clip_32_cast_fp16_to_fp32)[name = string("clip_33")];
+            fp32 var_676_promoted_1 = const()[name = string("op_676_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_722 = pow(x = clip_33, y = var_676_promoted_1)[name = string("op_722")];
+            tensor<int32, [1]> var_724_axes_0 = const()[name = string("op_724_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_724_keep_dims_0 = const()[name = string("op_724_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_724 = reduce_mean(axes = var_724_axes_0, keep_dims = var_724_keep_dims_0, x = var_722)[name = string("op_724")];
+            string var_724_to_fp16_dtype_0 = const()[name = string("op_724_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_725_to_fp16 = const()[name = string("op_725_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_724_to_fp16 = cast(dtype = var_724_to_fp16_dtype_0, x = var_724)[name = string("cast_488")];
+            tensor<fp16, [1, 50, 1]> mean_squared_21_cast_fp16 = add(x = var_724_to_fp16, y = var_725_to_fp16)[name = string("mean_squared_21_cast_fp16")];
+            string mean_squared_21_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_21_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_21_cast_fp16_to_fp32 = cast(dtype = mean_squared_21_cast_fp16_to_fp32_dtype_0, x = mean_squared_21_cast_fp16)[name = string("cast_487")];
+            tensor<fp32, [1, 50, 1]> var_727 = pow(x = mean_squared_21_cast_fp16_to_fp32, y = var_680)[name = string("op_727")];
+            string clip_33_to_fp16_dtype_0 = const()[name = string("clip_33_to_fp16_dtype_0"), val = string("fp16")];
+            string var_727_to_fp16_dtype_0 = const()[name = string("op_727_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_33_to_fp16 = cast(dtype = clip_33_to_fp16_dtype_0, x = clip_33)[name = string("cast_485")];
+            tensor<fp16, [1, 50, 1]> var_727_to_fp16 = cast(dtype = var_727_to_fp16_dtype_0, x = var_727)[name = string("cast_486")];
+            tensor<fp16, [1, 50, 1024]> normed_output_41_cast_fp16 = mul(x = clip_33_to_fp16, y = var_727_to_fp16)[name = string("normed_output_41_cast_fp16")];
+            tensor<fp16, [1024]> const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16890432)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_43_cast_fp16 = mul(x = normed_output_41_cast_fp16, y = const_18_to_fp16)[name = string("normed_output_43_cast_fp16")];
+            fp16 var_672_to_fp16 = const()[name = string("op_672_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_135_cast_fp16 = mul(x = normed_output_43_cast_fp16, y = var_672_to_fp16)[name = string("hidden_states_135_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_137_cast_fp16 = add(x = hidden_states_135_cast_fp16, y = normed_output_35_cast_fp16)[name = string("hidden_states_137_cast_fp16")];
+            fp16 var_734_to_fp16 = const()[name = string("op_734_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_735_to_fp16 = const()[name = string("op_735_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_34_cast_fp16 = clip(alpha = var_734_to_fp16, beta = var_735_to_fp16, x = hidden_states_137_cast_fp16)[name = string("clip_34_cast_fp16")];
+            string clip_34_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_34_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_737 = const()[name = string("op_737"), val = fp32(-0x1p-1)];
+            fp32 var_741_promoted = const()[name = string("op_741_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_34_cast_fp16_to_fp32 = cast(dtype = clip_34_cast_fp16_to_fp32_dtype_0, x = clip_34_cast_fp16)[name = string("cast_484")];
+            tensor<fp32, [1, 50, 1024]> var_747 = pow(x = clip_34_cast_fp16_to_fp32, y = var_741_promoted)[name = string("op_747")];
+            tensor<int32, [1]> var_749_axes_0 = const()[name = string("op_749_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_749_keep_dims_0 = const()[name = string("op_749_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_749 = reduce_mean(axes = var_749_axes_0, keep_dims = var_749_keep_dims_0, x = var_747)[name = string("op_749")];
+            string var_749_to_fp16_dtype_0 = const()[name = string("op_749_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_750_to_fp16 = const()[name = string("op_750_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_749_to_fp16 = cast(dtype = var_749_to_fp16_dtype_0, x = var_749)[name = string("cast_483")];
+            tensor<fp16, [1, 50, 1]> mean_squared_23_cast_fp16 = add(x = var_749_to_fp16, y = var_750_to_fp16)[name = string("mean_squared_23_cast_fp16")];
+            string mean_squared_23_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_23_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_23_cast_fp16_to_fp32 = cast(dtype = mean_squared_23_cast_fp16_to_fp32_dtype_0, x = mean_squared_23_cast_fp16)[name = string("cast_482")];
+            tensor<fp32, [1, 50, 1]> var_752 = pow(x = mean_squared_23_cast_fp16_to_fp32, y = var_737)[name = string("op_752")];
+            string var_752_to_fp16_dtype_0 = const()[name = string("op_752_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_752_to_fp16 = cast(dtype = var_752_to_fp16_dtype_0, x = var_752)[name = string("cast_481")];
+            tensor<fp16, [1, 50, 1024]> normed_output_45_cast_fp16 = mul(x = clip_34_cast_fp16, y = var_752_to_fp16)[name = string("normed_output_45_cast_fp16")];
+            tensor<fp16, [1024]> const_19_to_fp16 = const()[name = string("const_19_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16892544)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_47_cast_fp16 = mul(x = normed_output_45_cast_fp16, y = const_19_to_fp16)[name = string("normed_output_47_cast_fp16")];
+            int32 var_758 = const()[name = string("op_758"), val = int32(-1)];
+            fp32 var_759 = const()[name = string("op_759"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_1_q_proj_input_min_to_fp16 = const()[name = string("self_attns_1_q_proj_input_min_to_fp16"), val = fp16(-0x1.6cp+3)];
+            fp16 self_attns_1_q_proj_input_max_to_fp16 = const()[name = string("self_attns_1_q_proj_input_max_to_fp16"), val = fp16(0x1.6ap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_35_cast_fp16 = clip(alpha = self_attns_1_q_proj_input_min_to_fp16, beta = self_attns_1_q_proj_input_max_to_fp16, x = normed_output_47_cast_fp16)[name = string("clip_35_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_1_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16894656))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17419008))))[name = string("self_attns_1_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_14_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_1_q_proj_linear_weight_to_fp16_palettized, x = clip_35_cast_fp16)[name = string("linear_14_cast_fp16")];
+            fp16 self_attns_1_q_proj_output_min_to_fp16 = const()[name = string("self_attns_1_q_proj_output_min_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 self_attns_1_q_proj_output_max_to_fp16 = const()[name = string("self_attns_1_q_proj_output_max_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_36_cast_fp16 = clip(alpha = self_attns_1_q_proj_output_min_to_fp16, beta = self_attns_1_q_proj_output_max_to_fp16, x = linear_14_cast_fp16)[name = string("clip_36_cast_fp16")];
+            tensor<int32, [4]> var_803 = const()[name = string("op_803"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_3_cast_fp16 = reshape(shape = var_803, x = clip_36_cast_fp16)[name = string("q_3_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_1_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17420096))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17944448))))[name = string("self_attns_1_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_15_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_1_k_proj_linear_weight_to_fp16_palettized, x = clip_35_cast_fp16)[name = string("linear_15_cast_fp16")];
+            fp16 self_attns_1_k_proj_output_min_to_fp16 = const()[name = string("self_attns_1_k_proj_output_min_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 self_attns_1_k_proj_output_max_to_fp16 = const()[name = string("self_attns_1_k_proj_output_max_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_38_cast_fp16 = clip(alpha = self_attns_1_k_proj_output_min_to_fp16, beta = self_attns_1_k_proj_output_max_to_fp16, x = linear_15_cast_fp16)[name = string("clip_38_cast_fp16")];
+            tensor<int32, [4]> var_815 = const()[name = string("op_815"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_3_cast_fp16 = reshape(shape = var_815, x = clip_38_cast_fp16)[name = string("k_3_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_1_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17945536))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18469888))))[name = string("self_attns_1_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_16_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_1_v_proj_linear_weight_to_fp16_palettized, x = clip_35_cast_fp16)[name = string("linear_16_cast_fp16")];
+            fp16 self_attns_1_v_proj_output_min_to_fp16 = const()[name = string("self_attns_1_v_proj_output_min_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 self_attns_1_v_proj_output_max_to_fp16 = const()[name = string("self_attns_1_v_proj_output_max_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_40_cast_fp16 = clip(alpha = self_attns_1_v_proj_output_min_to_fp16, beta = self_attns_1_v_proj_output_max_to_fp16, x = linear_16_cast_fp16)[name = string("clip_40_cast_fp16")];
+            tensor<int32, [4]> var_827 = const()[name = string("op_827"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_75_cast_fp16 = reshape(shape = var_827, x = clip_40_cast_fp16)[name = string("input_75_cast_fp16")];
+            fp16 var_829_to_fp16 = const()[name = string("op_829_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_830_cast_fp16 = mul(x = q_3_cast_fp16, y = var_829_to_fp16)[name = string("op_830_cast_fp16")];
+            tensor<fp16, [128]> var_831_to_fp16 = const()[name = string("op_831_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18470976)))];
+            tensor<fp16, [1, 50, 8, 128]> input_71_cast_fp16 = mul(x = var_830_cast_fp16, y = var_831_to_fp16)[name = string("input_71_cast_fp16")];
+            fp16 var_833_to_fp16 = const()[name = string("op_833_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_73_cast_fp16 = mul(x = k_3_cast_fp16, y = var_833_to_fp16)[name = string("input_73_cast_fp16")];
+            tensor<int32, [8]> q_padded_3_pad_0 = const()[name = string("q_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_3_mode_0 = const()[name = string("q_padded_3_mode_0"), val = string("constant")];
+            fp16 const_20_to_fp16 = const()[name = string("const_20_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_3_cast_fp16 = pad(constant_val = const_20_to_fp16, mode = q_padded_3_mode_0, pad = q_padded_3_pad_0, x = input_71_cast_fp16)[name = string("q_padded_3_cast_fp16")];
+            tensor<int32, [5]> var_837 = const()[name = string("op_837"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_3_cast_fp16 = reshape(shape = var_837, x = q_padded_3_cast_fp16)[name = string("q_blocks_3_cast_fp16")];
+            tensor<int32, [8]> k_padded_3_pad_0 = const()[name = string("k_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_3_mode_0 = const()[name = string("k_padded_3_mode_0"), val = string("constant")];
+            fp16 const_21_to_fp16 = const()[name = string("const_21_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_3_cast_fp16 = pad(constant_val = const_21_to_fp16, mode = k_padded_3_mode_0, pad = k_padded_3_pad_0, x = input_73_cast_fp16)[name = string("k_padded_3_cast_fp16")];
+            tensor<int32, [8]> v_padded_3_pad_0 = const()[name = string("v_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_3_mode_0 = const()[name = string("v_padded_3_mode_0"), val = string("constant")];
+            fp16 const_22_to_fp16 = const()[name = string("const_22_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_3_cast_fp16 = pad(constant_val = const_22_to_fp16, mode = v_padded_3_mode_0, pad = v_padded_3_pad_0, x = input_75_cast_fp16)[name = string("v_padded_3_cast_fp16")];
+            tensor<int32, [4]> var_844_begin_0 = const()[name = string("op_844_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_844_end_0 = const()[name = string("op_844_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_844_end_mask_0 = const()[name = string("op_844_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_844_cast_fp16 = slice_by_index(begin = var_844_begin_0, end = var_844_end_0, end_mask = var_844_end_mask_0, x = k_padded_3_cast_fp16)[name = string("op_844_cast_fp16")];
+            tensor<int32, [4]> var_846_begin_0 = const()[name = string("op_846_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_846_end_0 = const()[name = string("op_846_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_846_end_mask_0 = const()[name = string("op_846_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_846_cast_fp16 = slice_by_index(begin = var_846_begin_0, end = var_846_end_0, end_mask = var_846_end_mask_0, x = k_padded_3_cast_fp16)[name = string("op_846_cast_fp16")];
+            tensor<int32, [4]> var_848_begin_0 = const()[name = string("op_848_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_848_end_0 = const()[name = string("op_848_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_848_end_mask_0 = const()[name = string("op_848_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_848_cast_fp16 = slice_by_index(begin = var_848_begin_0, end = var_848_end_0, end_mask = var_848_end_mask_0, x = k_padded_3_cast_fp16)[name = string("op_848_cast_fp16")];
+            tensor<int32, [4]> var_850_begin_0 = const()[name = string("op_850_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_850_end_0 = const()[name = string("op_850_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_850_end_mask_0 = const()[name = string("op_850_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_850_cast_fp16 = slice_by_index(begin = var_850_begin_0, end = var_850_end_0, end_mask = var_850_end_mask_0, x = k_padded_3_cast_fp16)[name = string("op_850_cast_fp16")];
+            tensor<int32, [4]> var_852_begin_0 = const()[name = string("op_852_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_852_end_0 = const()[name = string("op_852_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_852_end_mask_0 = const()[name = string("op_852_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_852_cast_fp16 = slice_by_index(begin = var_852_begin_0, end = var_852_end_0, end_mask = var_852_end_mask_0, x = k_padded_3_cast_fp16)[name = string("op_852_cast_fp16")];
+            int32 k_blocks_3_axis_0 = const()[name = string("k_blocks_3_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_3_cast_fp16 = stack(axis = k_blocks_3_axis_0, values = (var_844_cast_fp16, var_846_cast_fp16, var_848_cast_fp16, var_850_cast_fp16, var_852_cast_fp16))[name = string("k_blocks_3_cast_fp16")];
+            tensor<int32, [4]> var_856_begin_0 = const()[name = string("op_856_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_856_end_0 = const()[name = string("op_856_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_856_end_mask_0 = const()[name = string("op_856_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_856_cast_fp16 = slice_by_index(begin = var_856_begin_0, end = var_856_end_0, end_mask = var_856_end_mask_0, x = v_padded_3_cast_fp16)[name = string("op_856_cast_fp16")];
+            tensor<int32, [4]> var_858_begin_0 = const()[name = string("op_858_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_858_end_0 = const()[name = string("op_858_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_858_end_mask_0 = const()[name = string("op_858_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_858_cast_fp16 = slice_by_index(begin = var_858_begin_0, end = var_858_end_0, end_mask = var_858_end_mask_0, x = v_padded_3_cast_fp16)[name = string("op_858_cast_fp16")];
+            tensor<int32, [4]> var_860_begin_0 = const()[name = string("op_860_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_860_end_0 = const()[name = string("op_860_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_860_end_mask_0 = const()[name = string("op_860_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_860_cast_fp16 = slice_by_index(begin = var_860_begin_0, end = var_860_end_0, end_mask = var_860_end_mask_0, x = v_padded_3_cast_fp16)[name = string("op_860_cast_fp16")];
+            tensor<int32, [4]> var_862_begin_0 = const()[name = string("op_862_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_862_end_0 = const()[name = string("op_862_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_862_end_mask_0 = const()[name = string("op_862_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_862_cast_fp16 = slice_by_index(begin = var_862_begin_0, end = var_862_end_0, end_mask = var_862_end_mask_0, x = v_padded_3_cast_fp16)[name = string("op_862_cast_fp16")];
+            tensor<int32, [4]> var_864_begin_0 = const()[name = string("op_864_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_864_end_0 = const()[name = string("op_864_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_864_end_mask_0 = const()[name = string("op_864_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_864_cast_fp16 = slice_by_index(begin = var_864_begin_0, end = var_864_end_0, end_mask = var_864_end_mask_0, x = v_padded_3_cast_fp16)[name = string("op_864_cast_fp16")];
+            int32 v_blocks_3_axis_0 = const()[name = string("v_blocks_3_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_3_cast_fp16 = stack(axis = v_blocks_3_axis_0, values = (var_856_cast_fp16, var_858_cast_fp16, var_860_cast_fp16, var_862_cast_fp16, var_864_cast_fp16))[name = string("v_blocks_3_cast_fp16")];
+            tensor<int32, [5]> var_872 = const()[name = string("op_872"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_874 = const()[name = string("op_874"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_3_transpose_x_0 = const()[name = string("matrix_ac_3_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_3_transpose_y_0 = const()[name = string("matrix_ac_3_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_3_cast_fp16 = transpose(perm = var_872, x = q_blocks_3_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_3_cast_fp16 = transpose(perm = var_874, x = k_blocks_3_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_3_cast_fp16 = matmul(transpose_x = matrix_ac_3_transpose_x_0, transpose_y = matrix_ac_3_transpose_y_0, x = queries_3_cast_fp16, y = keys_t_3_cast_fp16)[name = string("matrix_ac_3_cast_fp16")];
+            tensor<int32, [4]> var_877 = const()[name = string("op_877"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_3_cast_fp16 = reshape(shape = var_877, x = queries_3_cast_fp16)[name = string("q_flat_3_cast_fp16")];
+            bool matrix_bd_11_transpose_x_0 = const()[name = string("matrix_bd_11_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_11_transpose_y_0 = const()[name = string("matrix_bd_11_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_3_to_fp16 = const()[name = string("rel_k_t_3_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18471296)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_11_cast_fp16 = matmul(transpose_x = matrix_bd_11_transpose_x_0, transpose_y = matrix_bd_11_transpose_y_0, x = q_flat_3_cast_fp16, y = rel_k_t_3_to_fp16)[name = string("matrix_bd_11_cast_fp16")];
+            tensor<int32, [5]> var_882 = const()[name = string("op_882"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_77_cast_fp16 = reshape(shape = var_882, x = matrix_bd_11_cast_fp16)[name = string("input_77_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_13_pad_0 = const()[name = string("matrix_bd_13_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18497984)))];
+            string matrix_bd_13_mode_0 = const()[name = string("matrix_bd_13_mode_0"), val = string("constant")];
+            fp16 const_24_to_fp16 = const()[name = string("const_24_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_13_cast_fp16 = pad(constant_val = const_24_to_fp16, mode = matrix_bd_13_mode_0, pad = matrix_bd_13_pad_0, x = input_77_cast_fp16)[name = string("matrix_bd_13_cast_fp16")];
+            tensor<int32, [4]> var_886 = const()[name = string("op_886"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_15_cast_fp16 = reshape(shape = var_886, x = matrix_bd_13_cast_fp16)[name = string("matrix_bd_15_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_17_begin_0 = const()[name = string("matrix_bd_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_17_end_0 = const()[name = string("matrix_bd_17_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_17_end_mask_0 = const()[name = string("matrix_bd_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_17_cast_fp16 = slice_by_index(begin = matrix_bd_17_begin_0, end = matrix_bd_17_end_0, end_mask = matrix_bd_17_end_mask_0, x = matrix_bd_15_cast_fp16)[name = string("matrix_bd_17_cast_fp16")];
+            tensor<int32, [5]> var_892 = const()[name = string("op_892"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_19_cast_fp16 = reshape(shape = var_892, x = matrix_bd_17_cast_fp16)[name = string("matrix_bd_19_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_7_cast_fp16 = add(x = matrix_ac_3_cast_fp16, y = matrix_bd_19_cast_fp16)[name = string("attn_7_cast_fp16")];
+            fp16 _inversed_895_y_0_to_fp16 = const()[name = string("_inversed_895_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_895_cast_fp16 = mul(x = attn_7_cast_fp16, y = _inversed_895_y_0_to_fp16)[name = string("_inversed_895_cast_fp16")];
+            string _inversed_895_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_895_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_895_cast_fp16_to_fp32 = cast(dtype = _inversed_895_cast_fp16_to_fp32_dtype_0, x = _inversed_895_cast_fp16)[name = string("cast_480")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_896 = tanh(x = _inversed_895_cast_fp16_to_fp32)[name = string("op_896")];
+            string var_896_to_fp16_dtype_0 = const()[name = string("op_896_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_1_softcap_to_fp16 = const()[name = string("self_attns_1_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_896_to_fp16 = cast(dtype = var_896_to_fp16_dtype_0, x = var_896)[name = string("cast_479")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_9_cast_fp16 = mul(x = var_896_to_fp16, y = self_attns_1_softcap_to_fp16)[name = string("attn_9_cast_fp16")];
+            string attn_9_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_9_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_9_cast_fp16_to_fp32 = cast(dtype = attn_9_cast_fp16_to_fp32_dtype_0, x = attn_9_cast_fp16)[name = string("cast_478")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_79 = select(a = var_759, b = attn_9_cast_fp16_to_fp32, cond = var_460)[name = string("input_79")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_900 = softmax(axis = var_758, x = input_79)[name = string("op_900")];
+            tensor<int32, [5]> var_902 = const()[name = string("op_902"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_7_transpose_x_0 = const()[name = string("out_7_transpose_x_0"), val = bool(false)];
+            bool out_7_transpose_y_0 = const()[name = string("out_7_transpose_y_0"), val = bool(false)];
+            string var_900_to_fp16_dtype_0 = const()[name = string("op_900_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_3_cast_fp16 = transpose(perm = var_902, x = v_blocks_3_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_900_to_fp16 = cast(dtype = var_900_to_fp16_dtype_0, x = var_900)[name = string("cast_477")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_7_cast_fp16 = matmul(transpose_x = out_7_transpose_x_0, transpose_y = out_7_transpose_y_0, x = var_900_to_fp16, y = values_t_3_cast_fp16)[name = string("out_7_cast_fp16")];
+            tensor<int32, [5]> var_905 = const()[name = string("op_905"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_907 = const()[name = string("op_907"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_906_cast_fp16 = transpose(perm = var_905, x = out_7_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 60, 1024]> out_9_cast_fp16 = reshape(shape = var_907, x = var_906_cast_fp16)[name = string("out_9_cast_fp16")];
+            tensor<int32, [3]> var_910_begin_0 = const()[name = string("op_910_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_910_end_0 = const()[name = string("op_910_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_910_end_mask_0 = const()[name = string("op_910_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_910_cast_fp16 = slice_by_index(begin = var_910_begin_0, end = var_910_end_0, end_mask = var_910_end_mask_0, x = out_9_cast_fp16)[name = string("op_910_cast_fp16")];
+            fp16 self_attns_1_post_input_min_to_fp16 = const()[name = string("self_attns_1_post_input_min_to_fp16"), val = fp16(-0x1.1ep+4)];
+            fp16 self_attns_1_post_input_max_to_fp16 = const()[name = string("self_attns_1_post_input_max_to_fp16"), val = fp16(0x1.1cp+4)];
+            tensor<fp16, [1, 50, 1024]> clip_41_cast_fp16 = clip(alpha = self_attns_1_post_input_min_to_fp16, beta = self_attns_1_post_input_max_to_fp16, x = var_910_cast_fp16)[name = string("clip_41_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_1_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(18498112))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19022464))))[name = string("self_attns_1_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_18_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_1_post_linear_weight_to_fp16_palettized, x = clip_41_cast_fp16)[name = string("linear_18_cast_fp16")];
+            fp16 self_attns_1_post_output_min_to_fp16 = const()[name = string("self_attns_1_post_output_min_to_fp16"), val = fp16(-0x1.dp+5)];
+            fp16 self_attns_1_post_output_max_to_fp16 = const()[name = string("self_attns_1_post_output_max_to_fp16"), val = fp16(0x1.ccp+5)];
+            tensor<fp16, [1, 50, 1024]> clip_42_cast_fp16 = clip(alpha = self_attns_1_post_output_min_to_fp16, beta = self_attns_1_post_output_max_to_fp16, x = linear_18_cast_fp16)[name = string("clip_42_cast_fp16")];
+            fp16 var_922_to_fp16 = const()[name = string("op_922_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_923_to_fp16 = const()[name = string("op_923_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_43_cast_fp16 = clip(alpha = var_922_to_fp16, beta = var_923_to_fp16, x = clip_42_cast_fp16)[name = string("clip_43_cast_fp16")];
+            string clip_43_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_43_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_925 = const()[name = string("op_925"), val = fp32(-0x1p-1)];
+            fp32 var_929_promoted = const()[name = string("op_929_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_43_cast_fp16_to_fp32 = cast(dtype = clip_43_cast_fp16_to_fp32_dtype_0, x = clip_43_cast_fp16)[name = string("cast_476")];
+            tensor<fp32, [1, 50, 1024]> var_935 = pow(x = clip_43_cast_fp16_to_fp32, y = var_929_promoted)[name = string("op_935")];
+            tensor<int32, [1]> var_937_axes_0 = const()[name = string("op_937_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_937_keep_dims_0 = const()[name = string("op_937_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_937 = reduce_mean(axes = var_937_axes_0, keep_dims = var_937_keep_dims_0, x = var_935)[name = string("op_937")];
+            string var_937_to_fp16_dtype_0 = const()[name = string("op_937_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_938_to_fp16 = const()[name = string("op_938_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_937_to_fp16 = cast(dtype = var_937_to_fp16_dtype_0, x = var_937)[name = string("cast_475")];
+            tensor<fp16, [1, 50, 1]> mean_squared_25_cast_fp16 = add(x = var_937_to_fp16, y = var_938_to_fp16)[name = string("mean_squared_25_cast_fp16")];
+            string mean_squared_25_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_25_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_25_cast_fp16_to_fp32 = cast(dtype = mean_squared_25_cast_fp16_to_fp32_dtype_0, x = mean_squared_25_cast_fp16)[name = string("cast_474")];
+            tensor<fp32, [1, 50, 1]> var_940 = pow(x = mean_squared_25_cast_fp16_to_fp32, y = var_925)[name = string("op_940")];
+            string var_940_to_fp16_dtype_0 = const()[name = string("op_940_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_940_to_fp16 = cast(dtype = var_940_to_fp16_dtype_0, x = var_940)[name = string("cast_473")];
+            tensor<fp16, [1, 50, 1024]> normed_output_49_cast_fp16 = mul(x = clip_43_cast_fp16, y = var_940_to_fp16)[name = string("normed_output_49_cast_fp16")];
+            tensor<fp16, [1024]> const_25_to_fp16 = const()[name = string("const_25_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19023552)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_51_cast_fp16 = mul(x = normed_output_49_cast_fp16, y = const_25_to_fp16)[name = string("normed_output_51_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_163_cast_fp16 = add(x = normed_output_51_cast_fp16, y = hidden_states_137_cast_fp16)[name = string("hidden_states_163_cast_fp16")];
+            string hidden_states_163_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_163_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_947 = const()[name = string("op_947"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_948 = const()[name = string("op_948"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_960 = const()[name = string("op_960"), val = fp32(-0x1p-1)];
+            fp32 var_956_promoted = const()[name = string("op_956_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_163_cast_fp16_to_fp32 = cast(dtype = hidden_states_163_cast_fp16_to_fp32_dtype_0, x = hidden_states_163_cast_fp16)[name = string("cast_472")];
+            tensor<fp32, [1, 50, 1024]> var_968 = pow(x = hidden_states_163_cast_fp16_to_fp32, y = var_956_promoted)[name = string("op_968")];
+            tensor<int32, [1]> var_970_axes_0 = const()[name = string("op_970_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_970_keep_dims_0 = const()[name = string("op_970_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_970 = reduce_mean(axes = var_970_axes_0, keep_dims = var_970_keep_dims_0, x = var_968)[name = string("op_970")];
+            string var_970_to_fp16_dtype_0 = const()[name = string("op_970_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_971_to_fp16 = const()[name = string("op_971_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_970_to_fp16 = cast(dtype = var_970_to_fp16_dtype_0, x = var_970)[name = string("cast_471")];
+            tensor<fp16, [1, 50, 1]> mean_squared_27_cast_fp16 = add(x = var_970_to_fp16, y = var_971_to_fp16)[name = string("mean_squared_27_cast_fp16")];
+            string mean_squared_27_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_27_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_27_cast_fp16_to_fp32 = cast(dtype = mean_squared_27_cast_fp16_to_fp32_dtype_0, x = mean_squared_27_cast_fp16)[name = string("cast_470")];
+            tensor<fp32, [1, 50, 1]> var_973 = pow(x = mean_squared_27_cast_fp16_to_fp32, y = var_960)[name = string("op_973")];
+            string var_973_to_fp16_dtype_0 = const()[name = string("op_973_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_973_to_fp16 = cast(dtype = var_973_to_fp16_dtype_0, x = var_973)[name = string("cast_469")];
+            tensor<fp16, [1, 50, 1024]> normed_output_53_cast_fp16 = mul(x = hidden_states_163_cast_fp16, y = var_973_to_fp16)[name = string("normed_output_53_cast_fp16")];
+            tensor<fp16, [1024]> const_26_to_fp16 = const()[name = string("const_26_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19025664)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_55_cast_fp16 = mul(x = normed_output_53_cast_fp16, y = const_26_to_fp16)[name = string("normed_output_55_cast_fp16")];
+            fp16 lconv1ds_1_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_1_linear_start_input_min_to_fp16"), val = fp16(-0x1.46p+4)];
+            fp16 lconv1ds_1_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_1_linear_start_input_max_to_fp16"), val = fp16(0x1.42p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_44_cast_fp16 = clip(alpha = lconv1ds_1_linear_start_input_min_to_fp16, beta = lconv1ds_1_linear_start_input_max_to_fp16, x = normed_output_55_cast_fp16)[name = string("clip_44_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_1_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(19027776))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20076416))))[name = string("lconv1ds_1_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_19_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_1_linear_start_linear_weight_to_fp16_palettized, x = clip_44_cast_fp16)[name = string("linear_19_cast_fp16")];
+            fp16 lconv1ds_1_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_1_linear_start_output_min_to_fp16"), val = fp16(-0x1.aep+4)];
+            fp16 lconv1ds_1_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_1_linear_start_output_max_to_fp16"), val = fp16(0x1.aap+4)];
+            tensor<fp16, [1, 50, 2048]> clip_45_cast_fp16 = clip(alpha = lconv1ds_1_linear_start_output_min_to_fp16, beta = lconv1ds_1_linear_start_output_max_to_fp16, x = linear_19_cast_fp16)[name = string("clip_45_cast_fp16")];
+            int32 hidden_states_171_split_num_splits_0 = const()[name = string("hidden_states_171_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_171_split_axis_0 = const()[name = string("hidden_states_171_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_171_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_171_split_cast_fp16_1 = split(axis = hidden_states_171_split_axis_0, num_splits = hidden_states_171_split_num_splits_0, x = clip_45_cast_fp16)[name = string("hidden_states_171_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_171_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_171_split_cast_fp16_1)[name = string("hidden_states_171_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_171_cast_fp16 = mul(x = hidden_states_171_split_cast_fp16_0, y = hidden_states_171_split_1_sigmoid_cast_fp16)[name = string("hidden_states_171_cast_fp16")];
+            tensor<int32, [3]> input_87_perm_0 = const()[name = string("input_87_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_89_pad_0 = const()[name = string("input_89_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_89_mode_0 = const()[name = string("input_89_mode_0"), val = string("constant")];
+            fp16 const_27_to_fp16 = const()[name = string("const_27_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_87_cast_fp16 = transpose(perm = input_87_perm_0, x = hidden_states_171_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 1024, 54]> input_89_cast_fp16 = pad(constant_val = const_27_to_fp16, mode = input_89_mode_0, pad = input_89_pad_0, x = input_87_cast_fp16)[name = string("input_89_cast_fp16")];
+            string var_999_pad_type_0 = const()[name = string("op_999_pad_type_0"), val = string("valid")];
+            int32 var_999_groups_0 = const()[name = string("op_999_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_999_strides_0 = const()[name = string("op_999_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_999_pad_0 = const()[name = string("op_999_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_999_dilations_0 = const()[name = string("op_999_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_1_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20078528))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20081152))))[name = string("lconv1ds_1_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_999_cast_fp16 = conv(dilations = var_999_dilations_0, groups = var_999_groups_0, pad = var_999_pad_0, pad_type = var_999_pad_type_0, strides = var_999_strides_0, weight = lconv1ds_1_depthwise_conv1d_weight_to_fp16_palettized, x = input_89_cast_fp16)[name = string("op_999_cast_fp16")];
+            tensor<int32, [3]> hidden_states_173_perm_0 = const()[name = string("hidden_states_173_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_173_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_173_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_173_cast_fp16 = transpose(perm = hidden_states_173_perm_0, x = var_999_cast_fp16)[name = string("transpose_60")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_173_cast_fp16_to_fp32 = cast(dtype = hidden_states_173_cast_fp16_to_fp32_dtype_0, x = hidden_states_173_cast_fp16)[name = string("cast_468")];
+            tensor<fp32, [1, 50, 1024]> clip_46 = clip(alpha = var_948, beta = var_947, x = hidden_states_173_cast_fp16_to_fp32)[name = string("clip_46")];
+            fp32 var_956_promoted_1 = const()[name = string("op_956_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1004 = pow(x = clip_46, y = var_956_promoted_1)[name = string("op_1004")];
+            tensor<int32, [1]> var_1006_axes_0 = const()[name = string("op_1006_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1006_keep_dims_0 = const()[name = string("op_1006_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1006 = reduce_mean(axes = var_1006_axes_0, keep_dims = var_1006_keep_dims_0, x = var_1004)[name = string("op_1006")];
+            string var_1006_to_fp16_dtype_0 = const()[name = string("op_1006_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1007_to_fp16 = const()[name = string("op_1007_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1006_to_fp16 = cast(dtype = var_1006_to_fp16_dtype_0, x = var_1006)[name = string("cast_467")];
+            tensor<fp16, [1, 50, 1]> mean_squared_29_cast_fp16 = add(x = var_1006_to_fp16, y = var_1007_to_fp16)[name = string("mean_squared_29_cast_fp16")];
+            string mean_squared_29_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_29_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_29_cast_fp16_to_fp32 = cast(dtype = mean_squared_29_cast_fp16_to_fp32_dtype_0, x = mean_squared_29_cast_fp16)[name = string("cast_466")];
+            tensor<fp32, [1, 50, 1]> var_1009 = pow(x = mean_squared_29_cast_fp16_to_fp32, y = var_960)[name = string("op_1009")];
+            string clip_46_to_fp16_dtype_0 = const()[name = string("clip_46_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1009_to_fp16_dtype_0 = const()[name = string("op_1009_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_46_to_fp16 = cast(dtype = clip_46_to_fp16_dtype_0, x = clip_46)[name = string("cast_464")];
+            tensor<fp16, [1, 50, 1]> var_1009_to_fp16 = cast(dtype = var_1009_to_fp16_dtype_0, x = var_1009)[name = string("cast_465")];
+            tensor<fp16, [1, 50, 1024]> normed_output_57_cast_fp16 = mul(x = clip_46_to_fp16, y = var_1009_to_fp16)[name = string("normed_output_57_cast_fp16")];
+            tensor<fp16, [1024]> const_28_to_fp16 = const()[name = string("const_28_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20082240)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_59_cast_fp16 = mul(x = normed_output_57_cast_fp16, y = const_28_to_fp16)[name = string("normed_output_59_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_179_cast_fp16 = silu(x = normed_output_59_cast_fp16)[name = string("hidden_states_179_cast_fp16")];
+            fp16 lconv1ds_1_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_1_linear_end_input_min_to_fp16"), val = fp16(-0x1.fp+2)];
+            fp16 lconv1ds_1_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_1_linear_end_input_max_to_fp16"), val = fp16(0x1.ecp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_47_cast_fp16 = clip(alpha = lconv1ds_1_linear_end_input_min_to_fp16, beta = lconv1ds_1_linear_end_input_max_to_fp16, x = hidden_states_179_cast_fp16)[name = string("clip_47_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_1_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20084352))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20608704))))[name = string("lconv1ds_1_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_20_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_1_linear_end_linear_weight_to_fp16_palettized, x = clip_47_cast_fp16)[name = string("linear_20_cast_fp16")];
+            fp16 lconv1ds_1_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_1_linear_end_output_min_to_fp16"), val = fp16(-0x1.04p+3)];
+            fp16 lconv1ds_1_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_1_linear_end_output_max_to_fp16"), val = fp16(0x1.02p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_48_cast_fp16 = clip(alpha = lconv1ds_1_linear_end_output_min_to_fp16, beta = lconv1ds_1_linear_end_output_max_to_fp16, x = linear_20_cast_fp16)[name = string("clip_48_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_185_cast_fp16 = add(x = clip_48_cast_fp16, y = hidden_states_163_cast_fp16)[name = string("hidden_states_185_cast_fp16")];
+            string hidden_states_185_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_185_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1033 = const()[name = string("op_1033"), val = fp32(-0x1p-1)];
+            fp32 var_1034 = const()[name = string("op_1034"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1035 = const()[name = string("op_1035"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_185_cast_fp16_to_fp32 = cast(dtype = hidden_states_185_cast_fp16_to_fp32_dtype_0, x = hidden_states_185_cast_fp16)[name = string("cast_463")];
+            tensor<fp32, [1, 50, 1024]> clip_49 = clip(alpha = var_1035, beta = var_1034, x = hidden_states_185_cast_fp16_to_fp32)[name = string("clip_49")];
+            fp32 var_1029_promoted = const()[name = string("op_1029_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1043 = pow(x = clip_49, y = var_1029_promoted)[name = string("op_1043")];
+            tensor<int32, [1]> var_1045_axes_0 = const()[name = string("op_1045_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1045_keep_dims_0 = const()[name = string("op_1045_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1045 = reduce_mean(axes = var_1045_axes_0, keep_dims = var_1045_keep_dims_0, x = var_1043)[name = string("op_1045")];
+            string var_1045_to_fp16_dtype_0 = const()[name = string("op_1045_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1046_to_fp16 = const()[name = string("op_1046_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1045_to_fp16 = cast(dtype = var_1045_to_fp16_dtype_0, x = var_1045)[name = string("cast_462")];
+            tensor<fp16, [1, 50, 1]> mean_squared_31_cast_fp16 = add(x = var_1045_to_fp16, y = var_1046_to_fp16)[name = string("mean_squared_31_cast_fp16")];
+            string mean_squared_31_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_31_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_31_cast_fp16_to_fp32 = cast(dtype = mean_squared_31_cast_fp16_to_fp32_dtype_0, x = mean_squared_31_cast_fp16)[name = string("cast_461")];
+            tensor<fp32, [1, 50, 1]> var_1048 = pow(x = mean_squared_31_cast_fp16_to_fp32, y = var_1033)[name = string("op_1048")];
+            string clip_49_to_fp16_dtype_0 = const()[name = string("clip_49_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1048_to_fp16_dtype_0 = const()[name = string("op_1048_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_49_to_fp16 = cast(dtype = clip_49_to_fp16_dtype_0, x = clip_49)[name = string("cast_459")];
+            tensor<fp16, [1, 50, 1]> var_1048_to_fp16 = cast(dtype = var_1048_to_fp16_dtype_0, x = var_1048)[name = string("cast_460")];
+            tensor<fp16, [1, 50, 1024]> normed_output_61_cast_fp16 = mul(x = clip_49_to_fp16, y = var_1048_to_fp16)[name = string("normed_output_61_cast_fp16")];
+            tensor<fp16, [1024]> const_29_to_fp16 = const()[name = string("const_29_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20609792)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_63_cast_fp16 = mul(x = normed_output_61_cast_fp16, y = const_29_to_fp16)[name = string("normed_output_63_cast_fp16")];
+            fp16 feed_forward2s_1_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.74p+3)];
+            fp16 feed_forward2s_1_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.72p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_50_cast_fp16 = clip(alpha = feed_forward2s_1_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_1_ffw_layer_1_input_max_to_fp16, x = normed_output_63_cast_fp16)[name = string("clip_50_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_1_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(20611904))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22709120))))[name = string("feed_forward2s_1_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_21_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_1_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_50_cast_fp16)[name = string("linear_21_cast_fp16")];
+            fp16 feed_forward2s_1_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.bap+4)];
+            fp16 feed_forward2s_1_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.b8p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_51_cast_fp16 = clip(alpha = feed_forward2s_1_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_1_ffw_layer_1_output_max_to_fp16, x = linear_21_cast_fp16)[name = string("clip_51_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_195_cast_fp16 = silu(x = clip_51_cast_fp16)[name = string("hidden_states_195_cast_fp16")];
+            fp16 feed_forward2s_1_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.36p+3)];
+            fp16 feed_forward2s_1_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.32p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_52_cast_fp16 = clip(alpha = feed_forward2s_1_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_1_ffw_layer_2_input_max_to_fp16, x = hidden_states_195_cast_fp16)[name = string("clip_52_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_1_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(22713280))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24810496))))[name = string("feed_forward2s_1_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_22_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_1_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_52_cast_fp16)[name = string("linear_22_cast_fp16")];
+            fp16 feed_forward2s_1_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.32p+5)];
+            fp16 feed_forward2s_1_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_1_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.2ep+5)];
+            tensor<fp16, [1, 50, 1024]> clip_53_cast_fp16 = clip(alpha = feed_forward2s_1_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_1_ffw_layer_2_output_max_to_fp16, x = linear_22_cast_fp16)[name = string("clip_53_cast_fp16")];
+            string clip_53_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_53_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_53_cast_fp16_to_fp32 = cast(dtype = clip_53_cast_fp16_to_fp32_dtype_0, x = clip_53_cast_fp16)[name = string("cast_458")];
+            tensor<fp32, [1, 50, 1024]> clip_54 = clip(alpha = var_1035, beta = var_1034, x = clip_53_cast_fp16_to_fp32)[name = string("clip_54")];
+            fp32 var_1029_promoted_1 = const()[name = string("op_1029_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1075 = pow(x = clip_54, y = var_1029_promoted_1)[name = string("op_1075")];
+            tensor<int32, [1]> var_1077_axes_0 = const()[name = string("op_1077_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1077_keep_dims_0 = const()[name = string("op_1077_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1077 = reduce_mean(axes = var_1077_axes_0, keep_dims = var_1077_keep_dims_0, x = var_1075)[name = string("op_1077")];
+            string var_1077_to_fp16_dtype_0 = const()[name = string("op_1077_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1078_to_fp16 = const()[name = string("op_1078_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1077_to_fp16 = cast(dtype = var_1077_to_fp16_dtype_0, x = var_1077)[name = string("cast_457")];
+            tensor<fp16, [1, 50, 1]> mean_squared_33_cast_fp16 = add(x = var_1077_to_fp16, y = var_1078_to_fp16)[name = string("mean_squared_33_cast_fp16")];
+            string mean_squared_33_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_33_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_33_cast_fp16_to_fp32 = cast(dtype = mean_squared_33_cast_fp16_to_fp32_dtype_0, x = mean_squared_33_cast_fp16)[name = string("cast_456")];
+            tensor<fp32, [1, 50, 1]> var_1080 = pow(x = mean_squared_33_cast_fp16_to_fp32, y = var_1033)[name = string("op_1080")];
+            string clip_54_to_fp16_dtype_0 = const()[name = string("clip_54_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1080_to_fp16_dtype_0 = const()[name = string("op_1080_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_54_to_fp16 = cast(dtype = clip_54_to_fp16_dtype_0, x = clip_54)[name = string("cast_454")];
+            tensor<fp16, [1, 50, 1]> var_1080_to_fp16 = cast(dtype = var_1080_to_fp16_dtype_0, x = var_1080)[name = string("cast_455")];
+            tensor<fp16, [1, 50, 1024]> normed_output_65_cast_fp16 = mul(x = clip_54_to_fp16, y = var_1080_to_fp16)[name = string("normed_output_65_cast_fp16")];
+            tensor<fp16, [1024]> const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24811584)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_67_cast_fp16 = mul(x = normed_output_65_cast_fp16, y = const_30_to_fp16)[name = string("normed_output_67_cast_fp16")];
+            fp16 var_1025_to_fp16 = const()[name = string("op_1025_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_207_cast_fp16 = mul(x = normed_output_67_cast_fp16, y = var_1025_to_fp16)[name = string("hidden_states_207_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_209_cast_fp16 = add(x = hidden_states_207_cast_fp16, y = hidden_states_185_cast_fp16)[name = string("hidden_states_209_cast_fp16")];
+            fp16 var_1087_to_fp16 = const()[name = string("op_1087_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1088_to_fp16 = const()[name = string("op_1088_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_55_cast_fp16 = clip(alpha = var_1087_to_fp16, beta = var_1088_to_fp16, x = hidden_states_209_cast_fp16)[name = string("clip_55_cast_fp16")];
+            string clip_55_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_55_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1090 = const()[name = string("op_1090"), val = fp32(-0x1p-1)];
+            fp32 var_1094_promoted = const()[name = string("op_1094_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_55_cast_fp16_to_fp32 = cast(dtype = clip_55_cast_fp16_to_fp32_dtype_0, x = clip_55_cast_fp16)[name = string("cast_453")];
+            tensor<fp32, [1, 50, 1024]> var_1100 = pow(x = clip_55_cast_fp16_to_fp32, y = var_1094_promoted)[name = string("op_1100")];
+            tensor<int32, [1]> var_1102_axes_0 = const()[name = string("op_1102_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1102_keep_dims_0 = const()[name = string("op_1102_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1102 = reduce_mean(axes = var_1102_axes_0, keep_dims = var_1102_keep_dims_0, x = var_1100)[name = string("op_1102")];
+            string var_1102_to_fp16_dtype_0 = const()[name = string("op_1102_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1103_to_fp16 = const()[name = string("op_1103_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1102_to_fp16 = cast(dtype = var_1102_to_fp16_dtype_0, x = var_1102)[name = string("cast_452")];
+            tensor<fp16, [1, 50, 1]> mean_squared_35_cast_fp16 = add(x = var_1102_to_fp16, y = var_1103_to_fp16)[name = string("mean_squared_35_cast_fp16")];
+            string mean_squared_35_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_35_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_35_cast_fp16_to_fp32 = cast(dtype = mean_squared_35_cast_fp16_to_fp32_dtype_0, x = mean_squared_35_cast_fp16)[name = string("cast_451")];
+            tensor<fp32, [1, 50, 1]> var_1105 = pow(x = mean_squared_35_cast_fp16_to_fp32, y = var_1090)[name = string("op_1105")];
+            string var_1105_to_fp16_dtype_0 = const()[name = string("op_1105_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1105_to_fp16 = cast(dtype = var_1105_to_fp16_dtype_0, x = var_1105)[name = string("cast_450")];
+            tensor<fp16, [1, 50, 1024]> normed_output_69_cast_fp16 = mul(x = clip_55_cast_fp16, y = var_1105_to_fp16)[name = string("normed_output_69_cast_fp16")];
+            tensor<fp16, [1024]> const_31_to_fp16 = const()[name = string("const_31_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24813696)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_71_cast_fp16 = mul(x = normed_output_69_cast_fp16, y = const_31_to_fp16)[name = string("normed_output_71_cast_fp16")];
+            string normed_output_71_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_71_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1118 = const()[name = string("op_1118"), val = fp32(-0x1p-1)];
+            fp32 var_1119 = const()[name = string("op_1119"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1120 = const()[name = string("op_1120"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_71_cast_fp16_to_fp32 = cast(dtype = normed_output_71_cast_fp16_to_fp32_dtype_0, x = normed_output_71_cast_fp16)[name = string("cast_449")];
+            tensor<fp32, [1, 50, 1024]> clip_56 = clip(alpha = var_1120, beta = var_1119, x = normed_output_71_cast_fp16_to_fp32)[name = string("clip_56")];
+            fp32 var_1114_promoted = const()[name = string("op_1114_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1128 = pow(x = clip_56, y = var_1114_promoted)[name = string("op_1128")];
+            tensor<int32, [1]> var_1130_axes_0 = const()[name = string("op_1130_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1130_keep_dims_0 = const()[name = string("op_1130_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1130 = reduce_mean(axes = var_1130_axes_0, keep_dims = var_1130_keep_dims_0, x = var_1128)[name = string("op_1130")];
+            string var_1130_to_fp16_dtype_0 = const()[name = string("op_1130_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1131_to_fp16 = const()[name = string("op_1131_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1130_to_fp16 = cast(dtype = var_1130_to_fp16_dtype_0, x = var_1130)[name = string("cast_448")];
+            tensor<fp16, [1, 50, 1]> mean_squared_37_cast_fp16 = add(x = var_1130_to_fp16, y = var_1131_to_fp16)[name = string("mean_squared_37_cast_fp16")];
+            string mean_squared_37_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_37_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_37_cast_fp16_to_fp32 = cast(dtype = mean_squared_37_cast_fp16_to_fp32_dtype_0, x = mean_squared_37_cast_fp16)[name = string("cast_447")];
+            tensor<fp32, [1, 50, 1]> var_1133 = pow(x = mean_squared_37_cast_fp16_to_fp32, y = var_1118)[name = string("op_1133")];
+            string clip_56_to_fp16_dtype_0 = const()[name = string("clip_56_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1133_to_fp16_dtype_0 = const()[name = string("op_1133_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_56_to_fp16 = cast(dtype = clip_56_to_fp16_dtype_0, x = clip_56)[name = string("cast_445")];
+            tensor<fp16, [1, 50, 1]> var_1133_to_fp16 = cast(dtype = var_1133_to_fp16_dtype_0, x = var_1133)[name = string("cast_446")];
+            tensor<fp16, [1, 50, 1024]> normed_output_73_cast_fp16 = mul(x = clip_56_to_fp16, y = var_1133_to_fp16)[name = string("normed_output_73_cast_fp16")];
+            tensor<fp16, [1024]> const_32_to_fp16 = const()[name = string("const_32_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24815808)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_75_cast_fp16 = mul(x = normed_output_73_cast_fp16, y = const_32_to_fp16)[name = string("normed_output_75_cast_fp16")];
+            fp16 feed_forward1s_2_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.ap+3)];
+            fp16 feed_forward1s_2_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.9cp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_57_cast_fp16 = clip(alpha = feed_forward1s_2_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_2_ffw_layer_1_input_max_to_fp16, x = normed_output_75_cast_fp16)[name = string("clip_57_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_2_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(24817920))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26915136))))[name = string("feed_forward1s_2_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_23_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_2_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_57_cast_fp16)[name = string("linear_23_cast_fp16")];
+            fp16 feed_forward1s_2_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.c6p+4)];
+            fp16 feed_forward1s_2_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.c2p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_58_cast_fp16 = clip(alpha = feed_forward1s_2_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_2_ffw_layer_1_output_max_to_fp16, x = linear_23_cast_fp16)[name = string("clip_58_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_225_cast_fp16 = silu(x = clip_58_cast_fp16)[name = string("hidden_states_225_cast_fp16")];
+            fp16 feed_forward1s_2_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.28p+3)];
+            fp16 feed_forward1s_2_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.26p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_59_cast_fp16 = clip(alpha = feed_forward1s_2_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_2_ffw_layer_2_input_max_to_fp16, x = hidden_states_225_cast_fp16)[name = string("clip_59_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_2_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(26919296))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29016512))))[name = string("feed_forward1s_2_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_24_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_2_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_59_cast_fp16)[name = string("linear_24_cast_fp16")];
+            fp16 feed_forward1s_2_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.2p+5)];
+            fp16 feed_forward1s_2_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_2_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.1ep+5)];
+            tensor<fp16, [1, 50, 1024]> clip_60_cast_fp16 = clip(alpha = feed_forward1s_2_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_2_ffw_layer_2_output_max_to_fp16, x = linear_24_cast_fp16)[name = string("clip_60_cast_fp16")];
+            string clip_60_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_60_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_60_cast_fp16_to_fp32 = cast(dtype = clip_60_cast_fp16_to_fp32_dtype_0, x = clip_60_cast_fp16)[name = string("cast_444")];
+            tensor<fp32, [1, 50, 1024]> clip_61 = clip(alpha = var_1120, beta = var_1119, x = clip_60_cast_fp16_to_fp32)[name = string("clip_61")];
+            fp32 var_1114_promoted_1 = const()[name = string("op_1114_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1160 = pow(x = clip_61, y = var_1114_promoted_1)[name = string("op_1160")];
+            tensor<int32, [1]> var_1162_axes_0 = const()[name = string("op_1162_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1162_keep_dims_0 = const()[name = string("op_1162_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1162 = reduce_mean(axes = var_1162_axes_0, keep_dims = var_1162_keep_dims_0, x = var_1160)[name = string("op_1162")];
+            string var_1162_to_fp16_dtype_0 = const()[name = string("op_1162_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1163_to_fp16 = const()[name = string("op_1163_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1162_to_fp16 = cast(dtype = var_1162_to_fp16_dtype_0, x = var_1162)[name = string("cast_443")];
+            tensor<fp16, [1, 50, 1]> mean_squared_39_cast_fp16 = add(x = var_1162_to_fp16, y = var_1163_to_fp16)[name = string("mean_squared_39_cast_fp16")];
+            string mean_squared_39_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_39_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_39_cast_fp16_to_fp32 = cast(dtype = mean_squared_39_cast_fp16_to_fp32_dtype_0, x = mean_squared_39_cast_fp16)[name = string("cast_442")];
+            tensor<fp32, [1, 50, 1]> var_1165 = pow(x = mean_squared_39_cast_fp16_to_fp32, y = var_1118)[name = string("op_1165")];
+            string clip_61_to_fp16_dtype_0 = const()[name = string("clip_61_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1165_to_fp16_dtype_0 = const()[name = string("op_1165_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_61_to_fp16 = cast(dtype = clip_61_to_fp16_dtype_0, x = clip_61)[name = string("cast_440")];
+            tensor<fp16, [1, 50, 1]> var_1165_to_fp16 = cast(dtype = var_1165_to_fp16_dtype_0, x = var_1165)[name = string("cast_441")];
+            tensor<fp16, [1, 50, 1024]> normed_output_77_cast_fp16 = mul(x = clip_61_to_fp16, y = var_1165_to_fp16)[name = string("normed_output_77_cast_fp16")];
+            tensor<fp16, [1024]> const_33_to_fp16 = const()[name = string("const_33_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29017600)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_79_cast_fp16 = mul(x = normed_output_77_cast_fp16, y = const_33_to_fp16)[name = string("normed_output_79_cast_fp16")];
+            fp16 var_1110_to_fp16 = const()[name = string("op_1110_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_237_cast_fp16 = mul(x = normed_output_79_cast_fp16, y = var_1110_to_fp16)[name = string("hidden_states_237_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_239_cast_fp16 = add(x = hidden_states_237_cast_fp16, y = normed_output_71_cast_fp16)[name = string("hidden_states_239_cast_fp16")];
+            fp16 var_1172_to_fp16 = const()[name = string("op_1172_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1173_to_fp16 = const()[name = string("op_1173_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_62_cast_fp16 = clip(alpha = var_1172_to_fp16, beta = var_1173_to_fp16, x = hidden_states_239_cast_fp16)[name = string("clip_62_cast_fp16")];
+            string clip_62_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_62_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1175 = const()[name = string("op_1175"), val = fp32(-0x1p-1)];
+            fp32 var_1179_promoted = const()[name = string("op_1179_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_62_cast_fp16_to_fp32 = cast(dtype = clip_62_cast_fp16_to_fp32_dtype_0, x = clip_62_cast_fp16)[name = string("cast_439")];
+            tensor<fp32, [1, 50, 1024]> var_1185 = pow(x = clip_62_cast_fp16_to_fp32, y = var_1179_promoted)[name = string("op_1185")];
+            tensor<int32, [1]> var_1187_axes_0 = const()[name = string("op_1187_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1187_keep_dims_0 = const()[name = string("op_1187_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1187 = reduce_mean(axes = var_1187_axes_0, keep_dims = var_1187_keep_dims_0, x = var_1185)[name = string("op_1187")];
+            string var_1187_to_fp16_dtype_0 = const()[name = string("op_1187_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1188_to_fp16 = const()[name = string("op_1188_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1187_to_fp16 = cast(dtype = var_1187_to_fp16_dtype_0, x = var_1187)[name = string("cast_438")];
+            tensor<fp16, [1, 50, 1]> mean_squared_41_cast_fp16 = add(x = var_1187_to_fp16, y = var_1188_to_fp16)[name = string("mean_squared_41_cast_fp16")];
+            string mean_squared_41_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_41_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_41_cast_fp16_to_fp32 = cast(dtype = mean_squared_41_cast_fp16_to_fp32_dtype_0, x = mean_squared_41_cast_fp16)[name = string("cast_437")];
+            tensor<fp32, [1, 50, 1]> var_1190 = pow(x = mean_squared_41_cast_fp16_to_fp32, y = var_1175)[name = string("op_1190")];
+            string var_1190_to_fp16_dtype_0 = const()[name = string("op_1190_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1190_to_fp16 = cast(dtype = var_1190_to_fp16_dtype_0, x = var_1190)[name = string("cast_436")];
+            tensor<fp16, [1, 50, 1024]> normed_output_81_cast_fp16 = mul(x = clip_62_cast_fp16, y = var_1190_to_fp16)[name = string("normed_output_81_cast_fp16")];
+            tensor<fp16, [1024]> const_34_to_fp16 = const()[name = string("const_34_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29019712)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_83_cast_fp16 = mul(x = normed_output_81_cast_fp16, y = const_34_to_fp16)[name = string("normed_output_83_cast_fp16")];
+            int32 var_1196 = const()[name = string("op_1196"), val = int32(-1)];
+            fp32 var_1197 = const()[name = string("op_1197"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_2_q_proj_input_min_to_fp16 = const()[name = string("self_attns_2_q_proj_input_min_to_fp16"), val = fp16(-0x1.9p+3)];
+            fp16 self_attns_2_q_proj_input_max_to_fp16 = const()[name = string("self_attns_2_q_proj_input_max_to_fp16"), val = fp16(0x1.8cp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_63_cast_fp16 = clip(alpha = self_attns_2_q_proj_input_min_to_fp16, beta = self_attns_2_q_proj_input_max_to_fp16, x = normed_output_83_cast_fp16)[name = string("clip_63_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_2_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29021824))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29546176))))[name = string("self_attns_2_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_25_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_2_q_proj_linear_weight_to_fp16_palettized, x = clip_63_cast_fp16)[name = string("linear_25_cast_fp16")];
+            fp16 self_attns_2_q_proj_output_min_to_fp16 = const()[name = string("self_attns_2_q_proj_output_min_to_fp16"), val = fp16(-0x1.4ep+4)];
+            fp16 self_attns_2_q_proj_output_max_to_fp16 = const()[name = string("self_attns_2_q_proj_output_max_to_fp16"), val = fp16(0x1.4cp+4)];
+            tensor<fp16, [1, 50, 1024]> clip_64_cast_fp16 = clip(alpha = self_attns_2_q_proj_output_min_to_fp16, beta = self_attns_2_q_proj_output_max_to_fp16, x = linear_25_cast_fp16)[name = string("clip_64_cast_fp16")];
+            tensor<int32, [4]> var_1241 = const()[name = string("op_1241"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_5_cast_fp16 = reshape(shape = var_1241, x = clip_64_cast_fp16)[name = string("q_5_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_2_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(29547264))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30071616))))[name = string("self_attns_2_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_26_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_2_k_proj_linear_weight_to_fp16_palettized, x = clip_63_cast_fp16)[name = string("linear_26_cast_fp16")];
+            fp16 self_attns_2_k_proj_output_min_to_fp16 = const()[name = string("self_attns_2_k_proj_output_min_to_fp16"), val = fp16(-0x1.4ep+4)];
+            fp16 self_attns_2_k_proj_output_max_to_fp16 = const()[name = string("self_attns_2_k_proj_output_max_to_fp16"), val = fp16(0x1.4cp+4)];
+            tensor<fp16, [1, 50, 1024]> clip_66_cast_fp16 = clip(alpha = self_attns_2_k_proj_output_min_to_fp16, beta = self_attns_2_k_proj_output_max_to_fp16, x = linear_26_cast_fp16)[name = string("clip_66_cast_fp16")];
+            tensor<int32, [4]> var_1253 = const()[name = string("op_1253"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_5_cast_fp16 = reshape(shape = var_1253, x = clip_66_cast_fp16)[name = string("k_5_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_2_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30072704))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30597056))))[name = string("self_attns_2_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_27_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_2_v_proj_linear_weight_to_fp16_palettized, x = clip_63_cast_fp16)[name = string("linear_27_cast_fp16")];
+            fp16 self_attns_2_v_proj_output_min_to_fp16 = const()[name = string("self_attns_2_v_proj_output_min_to_fp16"), val = fp16(-0x1.4ep+4)];
+            fp16 self_attns_2_v_proj_output_max_to_fp16 = const()[name = string("self_attns_2_v_proj_output_max_to_fp16"), val = fp16(0x1.4cp+4)];
+            tensor<fp16, [1, 50, 1024]> clip_68_cast_fp16 = clip(alpha = self_attns_2_v_proj_output_min_to_fp16, beta = self_attns_2_v_proj_output_max_to_fp16, x = linear_27_cast_fp16)[name = string("clip_68_cast_fp16")];
+            tensor<int32, [4]> var_1265 = const()[name = string("op_1265"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_117_cast_fp16 = reshape(shape = var_1265, x = clip_68_cast_fp16)[name = string("input_117_cast_fp16")];
+            fp16 var_1267_to_fp16 = const()[name = string("op_1267_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_1268_cast_fp16 = mul(x = q_5_cast_fp16, y = var_1267_to_fp16)[name = string("op_1268_cast_fp16")];
+            tensor<fp16, [128]> var_1269_to_fp16 = const()[name = string("op_1269_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30598144)))];
+            tensor<fp16, [1, 50, 8, 128]> input_113_cast_fp16 = mul(x = var_1268_cast_fp16, y = var_1269_to_fp16)[name = string("input_113_cast_fp16")];
+            fp16 var_1271_to_fp16 = const()[name = string("op_1271_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_115_cast_fp16 = mul(x = k_5_cast_fp16, y = var_1271_to_fp16)[name = string("input_115_cast_fp16")];
+            tensor<int32, [8]> q_padded_5_pad_0 = const()[name = string("q_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_5_mode_0 = const()[name = string("q_padded_5_mode_0"), val = string("constant")];
+            fp16 const_35_to_fp16 = const()[name = string("const_35_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_5_cast_fp16 = pad(constant_val = const_35_to_fp16, mode = q_padded_5_mode_0, pad = q_padded_5_pad_0, x = input_113_cast_fp16)[name = string("q_padded_5_cast_fp16")];
+            tensor<int32, [5]> var_1275 = const()[name = string("op_1275"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_5_cast_fp16 = reshape(shape = var_1275, x = q_padded_5_cast_fp16)[name = string("q_blocks_5_cast_fp16")];
+            tensor<int32, [8]> k_padded_5_pad_0 = const()[name = string("k_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_5_mode_0 = const()[name = string("k_padded_5_mode_0"), val = string("constant")];
+            fp16 const_36_to_fp16 = const()[name = string("const_36_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_5_cast_fp16 = pad(constant_val = const_36_to_fp16, mode = k_padded_5_mode_0, pad = k_padded_5_pad_0, x = input_115_cast_fp16)[name = string("k_padded_5_cast_fp16")];
+            tensor<int32, [8]> v_padded_5_pad_0 = const()[name = string("v_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_5_mode_0 = const()[name = string("v_padded_5_mode_0"), val = string("constant")];
+            fp16 const_37_to_fp16 = const()[name = string("const_37_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_5_cast_fp16 = pad(constant_val = const_37_to_fp16, mode = v_padded_5_mode_0, pad = v_padded_5_pad_0, x = input_117_cast_fp16)[name = string("v_padded_5_cast_fp16")];
+            tensor<int32, [4]> var_1282_begin_0 = const()[name = string("op_1282_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1282_end_0 = const()[name = string("op_1282_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_1282_end_mask_0 = const()[name = string("op_1282_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1282_cast_fp16 = slice_by_index(begin = var_1282_begin_0, end = var_1282_end_0, end_mask = var_1282_end_mask_0, x = k_padded_5_cast_fp16)[name = string("op_1282_cast_fp16")];
+            tensor<int32, [4]> var_1284_begin_0 = const()[name = string("op_1284_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_1284_end_0 = const()[name = string("op_1284_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_1284_end_mask_0 = const()[name = string("op_1284_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1284_cast_fp16 = slice_by_index(begin = var_1284_begin_0, end = var_1284_end_0, end_mask = var_1284_end_mask_0, x = k_padded_5_cast_fp16)[name = string("op_1284_cast_fp16")];
+            tensor<int32, [4]> var_1286_begin_0 = const()[name = string("op_1286_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_1286_end_0 = const()[name = string("op_1286_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_1286_end_mask_0 = const()[name = string("op_1286_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1286_cast_fp16 = slice_by_index(begin = var_1286_begin_0, end = var_1286_end_0, end_mask = var_1286_end_mask_0, x = k_padded_5_cast_fp16)[name = string("op_1286_cast_fp16")];
+            tensor<int32, [4]> var_1288_begin_0 = const()[name = string("op_1288_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_1288_end_0 = const()[name = string("op_1288_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_1288_end_mask_0 = const()[name = string("op_1288_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1288_cast_fp16 = slice_by_index(begin = var_1288_begin_0, end = var_1288_end_0, end_mask = var_1288_end_mask_0, x = k_padded_5_cast_fp16)[name = string("op_1288_cast_fp16")];
+            tensor<int32, [4]> var_1290_begin_0 = const()[name = string("op_1290_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_1290_end_0 = const()[name = string("op_1290_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_1290_end_mask_0 = const()[name = string("op_1290_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1290_cast_fp16 = slice_by_index(begin = var_1290_begin_0, end = var_1290_end_0, end_mask = var_1290_end_mask_0, x = k_padded_5_cast_fp16)[name = string("op_1290_cast_fp16")];
+            int32 k_blocks_5_axis_0 = const()[name = string("k_blocks_5_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_5_cast_fp16 = stack(axis = k_blocks_5_axis_0, values = (var_1282_cast_fp16, var_1284_cast_fp16, var_1286_cast_fp16, var_1288_cast_fp16, var_1290_cast_fp16))[name = string("k_blocks_5_cast_fp16")];
+            tensor<int32, [4]> var_1294_begin_0 = const()[name = string("op_1294_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1294_end_0 = const()[name = string("op_1294_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_1294_end_mask_0 = const()[name = string("op_1294_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1294_cast_fp16 = slice_by_index(begin = var_1294_begin_0, end = var_1294_end_0, end_mask = var_1294_end_mask_0, x = v_padded_5_cast_fp16)[name = string("op_1294_cast_fp16")];
+            tensor<int32, [4]> var_1296_begin_0 = const()[name = string("op_1296_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_1296_end_0 = const()[name = string("op_1296_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_1296_end_mask_0 = const()[name = string("op_1296_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1296_cast_fp16 = slice_by_index(begin = var_1296_begin_0, end = var_1296_end_0, end_mask = var_1296_end_mask_0, x = v_padded_5_cast_fp16)[name = string("op_1296_cast_fp16")];
+            tensor<int32, [4]> var_1298_begin_0 = const()[name = string("op_1298_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_1298_end_0 = const()[name = string("op_1298_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_1298_end_mask_0 = const()[name = string("op_1298_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1298_cast_fp16 = slice_by_index(begin = var_1298_begin_0, end = var_1298_end_0, end_mask = var_1298_end_mask_0, x = v_padded_5_cast_fp16)[name = string("op_1298_cast_fp16")];
+            tensor<int32, [4]> var_1300_begin_0 = const()[name = string("op_1300_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_1300_end_0 = const()[name = string("op_1300_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_1300_end_mask_0 = const()[name = string("op_1300_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1300_cast_fp16 = slice_by_index(begin = var_1300_begin_0, end = var_1300_end_0, end_mask = var_1300_end_mask_0, x = v_padded_5_cast_fp16)[name = string("op_1300_cast_fp16")];
+            tensor<int32, [4]> var_1302_begin_0 = const()[name = string("op_1302_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_1302_end_0 = const()[name = string("op_1302_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_1302_end_mask_0 = const()[name = string("op_1302_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1302_cast_fp16 = slice_by_index(begin = var_1302_begin_0, end = var_1302_end_0, end_mask = var_1302_end_mask_0, x = v_padded_5_cast_fp16)[name = string("op_1302_cast_fp16")];
+            int32 v_blocks_5_axis_0 = const()[name = string("v_blocks_5_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_5_cast_fp16 = stack(axis = v_blocks_5_axis_0, values = (var_1294_cast_fp16, var_1296_cast_fp16, var_1298_cast_fp16, var_1300_cast_fp16, var_1302_cast_fp16))[name = string("v_blocks_5_cast_fp16")];
+            tensor<int32, [5]> var_1310 = const()[name = string("op_1310"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_1312 = const()[name = string("op_1312"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_5_transpose_x_0 = const()[name = string("matrix_ac_5_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_5_transpose_y_0 = const()[name = string("matrix_ac_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_5_cast_fp16 = transpose(perm = var_1310, x = q_blocks_5_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_5_cast_fp16 = transpose(perm = var_1312, x = k_blocks_5_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_5_cast_fp16 = matmul(transpose_x = matrix_ac_5_transpose_x_0, transpose_y = matrix_ac_5_transpose_y_0, x = queries_5_cast_fp16, y = keys_t_5_cast_fp16)[name = string("matrix_ac_5_cast_fp16")];
+            tensor<int32, [4]> var_1315 = const()[name = string("op_1315"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_5_cast_fp16 = reshape(shape = var_1315, x = queries_5_cast_fp16)[name = string("q_flat_5_cast_fp16")];
+            bool matrix_bd_21_transpose_x_0 = const()[name = string("matrix_bd_21_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_21_transpose_y_0 = const()[name = string("matrix_bd_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_5_to_fp16 = const()[name = string("rel_k_t_5_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30598464)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_21_cast_fp16 = matmul(transpose_x = matrix_bd_21_transpose_x_0, transpose_y = matrix_bd_21_transpose_y_0, x = q_flat_5_cast_fp16, y = rel_k_t_5_to_fp16)[name = string("matrix_bd_21_cast_fp16")];
+            tensor<int32, [5]> var_1320 = const()[name = string("op_1320"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_119_cast_fp16 = reshape(shape = var_1320, x = matrix_bd_21_cast_fp16)[name = string("input_119_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_23_pad_0 = const()[name = string("matrix_bd_23_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30625152)))];
+            string matrix_bd_23_mode_0 = const()[name = string("matrix_bd_23_mode_0"), val = string("constant")];
+            fp16 const_39_to_fp16 = const()[name = string("const_39_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_23_cast_fp16 = pad(constant_val = const_39_to_fp16, mode = matrix_bd_23_mode_0, pad = matrix_bd_23_pad_0, x = input_119_cast_fp16)[name = string("matrix_bd_23_cast_fp16")];
+            tensor<int32, [4]> var_1324 = const()[name = string("op_1324"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_25_cast_fp16 = reshape(shape = var_1324, x = matrix_bd_23_cast_fp16)[name = string("matrix_bd_25_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_27_begin_0 = const()[name = string("matrix_bd_27_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_27_end_0 = const()[name = string("matrix_bd_27_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_27_end_mask_0 = const()[name = string("matrix_bd_27_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_27_cast_fp16 = slice_by_index(begin = matrix_bd_27_begin_0, end = matrix_bd_27_end_0, end_mask = matrix_bd_27_end_mask_0, x = matrix_bd_25_cast_fp16)[name = string("matrix_bd_27_cast_fp16")];
+            tensor<int32, [5]> var_1330 = const()[name = string("op_1330"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_29_cast_fp16 = reshape(shape = var_1330, x = matrix_bd_27_cast_fp16)[name = string("matrix_bd_29_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_13_cast_fp16 = add(x = matrix_ac_5_cast_fp16, y = matrix_bd_29_cast_fp16)[name = string("attn_13_cast_fp16")];
+            fp16 _inversed_1333_y_0_to_fp16 = const()[name = string("_inversed_1333_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_1333_cast_fp16 = mul(x = attn_13_cast_fp16, y = _inversed_1333_y_0_to_fp16)[name = string("_inversed_1333_cast_fp16")];
+            string _inversed_1333_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_1333_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_1333_cast_fp16_to_fp32 = cast(dtype = _inversed_1333_cast_fp16_to_fp32_dtype_0, x = _inversed_1333_cast_fp16)[name = string("cast_435")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_1334 = tanh(x = _inversed_1333_cast_fp16_to_fp32)[name = string("op_1334")];
+            string var_1334_to_fp16_dtype_0 = const()[name = string("op_1334_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_2_softcap_to_fp16 = const()[name = string("self_attns_2_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_1334_to_fp16 = cast(dtype = var_1334_to_fp16_dtype_0, x = var_1334)[name = string("cast_434")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_15_cast_fp16 = mul(x = var_1334_to_fp16, y = self_attns_2_softcap_to_fp16)[name = string("attn_15_cast_fp16")];
+            string attn_15_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_15_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_15_cast_fp16_to_fp32 = cast(dtype = attn_15_cast_fp16_to_fp32_dtype_0, x = attn_15_cast_fp16)[name = string("cast_433")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_121 = select(a = var_1197, b = attn_15_cast_fp16_to_fp32, cond = var_460)[name = string("input_121")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_1338 = softmax(axis = var_1196, x = input_121)[name = string("op_1338")];
+            tensor<int32, [5]> var_1340 = const()[name = string("op_1340"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_13_transpose_x_0 = const()[name = string("out_13_transpose_x_0"), val = bool(false)];
+            bool out_13_transpose_y_0 = const()[name = string("out_13_transpose_y_0"), val = bool(false)];
+            string var_1338_to_fp16_dtype_0 = const()[name = string("op_1338_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_5_cast_fp16 = transpose(perm = var_1340, x = v_blocks_5_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_1338_to_fp16 = cast(dtype = var_1338_to_fp16_dtype_0, x = var_1338)[name = string("cast_432")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_13_cast_fp16 = matmul(transpose_x = out_13_transpose_x_0, transpose_y = out_13_transpose_y_0, x = var_1338_to_fp16, y = values_t_5_cast_fp16)[name = string("out_13_cast_fp16")];
+            tensor<int32, [5]> var_1343 = const()[name = string("op_1343"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_1345 = const()[name = string("op_1345"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_1344_cast_fp16 = transpose(perm = var_1343, x = out_13_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 60, 1024]> out_15_cast_fp16 = reshape(shape = var_1345, x = var_1344_cast_fp16)[name = string("out_15_cast_fp16")];
+            tensor<int32, [3]> var_1348_begin_0 = const()[name = string("op_1348_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_1348_end_0 = const()[name = string("op_1348_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_1348_end_mask_0 = const()[name = string("op_1348_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_1348_cast_fp16 = slice_by_index(begin = var_1348_begin_0, end = var_1348_end_0, end_mask = var_1348_end_mask_0, x = out_15_cast_fp16)[name = string("op_1348_cast_fp16")];
+            fp16 self_attns_2_post_input_min_to_fp16 = const()[name = string("self_attns_2_post_input_min_to_fp16"), val = fp16(-0x1.22p+4)];
+            fp16 self_attns_2_post_input_max_to_fp16 = const()[name = string("self_attns_2_post_input_max_to_fp16"), val = fp16(0x1.2p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_69_cast_fp16 = clip(alpha = self_attns_2_post_input_min_to_fp16, beta = self_attns_2_post_input_max_to_fp16, x = var_1348_cast_fp16)[name = string("clip_69_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_2_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30625280))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31149632))))[name = string("self_attns_2_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_29_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_2_post_linear_weight_to_fp16_palettized, x = clip_69_cast_fp16)[name = string("linear_29_cast_fp16")];
+            fp16 self_attns_2_post_output_min_to_fp16 = const()[name = string("self_attns_2_post_output_min_to_fp16"), val = fp16(-0x1.eep+5)];
+            fp16 self_attns_2_post_output_max_to_fp16 = const()[name = string("self_attns_2_post_output_max_to_fp16"), val = fp16(0x1.eap+5)];
+            tensor<fp16, [1, 50, 1024]> clip_70_cast_fp16 = clip(alpha = self_attns_2_post_output_min_to_fp16, beta = self_attns_2_post_output_max_to_fp16, x = linear_29_cast_fp16)[name = string("clip_70_cast_fp16")];
+            fp16 var_1360_to_fp16 = const()[name = string("op_1360_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1361_to_fp16 = const()[name = string("op_1361_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_71_cast_fp16 = clip(alpha = var_1360_to_fp16, beta = var_1361_to_fp16, x = clip_70_cast_fp16)[name = string("clip_71_cast_fp16")];
+            string clip_71_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_71_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1363 = const()[name = string("op_1363"), val = fp32(-0x1p-1)];
+            fp32 var_1367_promoted = const()[name = string("op_1367_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_71_cast_fp16_to_fp32 = cast(dtype = clip_71_cast_fp16_to_fp32_dtype_0, x = clip_71_cast_fp16)[name = string("cast_431")];
+            tensor<fp32, [1, 50, 1024]> var_1373 = pow(x = clip_71_cast_fp16_to_fp32, y = var_1367_promoted)[name = string("op_1373")];
+            tensor<int32, [1]> var_1375_axes_0 = const()[name = string("op_1375_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1375_keep_dims_0 = const()[name = string("op_1375_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1375 = reduce_mean(axes = var_1375_axes_0, keep_dims = var_1375_keep_dims_0, x = var_1373)[name = string("op_1375")];
+            string var_1375_to_fp16_dtype_0 = const()[name = string("op_1375_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1376_to_fp16 = const()[name = string("op_1376_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1375_to_fp16 = cast(dtype = var_1375_to_fp16_dtype_0, x = var_1375)[name = string("cast_430")];
+            tensor<fp16, [1, 50, 1]> mean_squared_43_cast_fp16 = add(x = var_1375_to_fp16, y = var_1376_to_fp16)[name = string("mean_squared_43_cast_fp16")];
+            string mean_squared_43_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_43_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_43_cast_fp16_to_fp32 = cast(dtype = mean_squared_43_cast_fp16_to_fp32_dtype_0, x = mean_squared_43_cast_fp16)[name = string("cast_429")];
+            tensor<fp32, [1, 50, 1]> var_1378 = pow(x = mean_squared_43_cast_fp16_to_fp32, y = var_1363)[name = string("op_1378")];
+            string var_1378_to_fp16_dtype_0 = const()[name = string("op_1378_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1378_to_fp16 = cast(dtype = var_1378_to_fp16_dtype_0, x = var_1378)[name = string("cast_428")];
+            tensor<fp16, [1, 50, 1024]> normed_output_85_cast_fp16 = mul(x = clip_71_cast_fp16, y = var_1378_to_fp16)[name = string("normed_output_85_cast_fp16")];
+            tensor<fp16, [1024]> const_40_to_fp16 = const()[name = string("const_40_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31150720)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_87_cast_fp16 = mul(x = normed_output_85_cast_fp16, y = const_40_to_fp16)[name = string("normed_output_87_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_265_cast_fp16 = add(x = normed_output_87_cast_fp16, y = hidden_states_239_cast_fp16)[name = string("hidden_states_265_cast_fp16")];
+            string hidden_states_265_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_265_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1385 = const()[name = string("op_1385"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1386 = const()[name = string("op_1386"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_1398 = const()[name = string("op_1398"), val = fp32(-0x1p-1)];
+            fp32 var_1394_promoted = const()[name = string("op_1394_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_265_cast_fp16_to_fp32 = cast(dtype = hidden_states_265_cast_fp16_to_fp32_dtype_0, x = hidden_states_265_cast_fp16)[name = string("cast_427")];
+            tensor<fp32, [1, 50, 1024]> var_1406 = pow(x = hidden_states_265_cast_fp16_to_fp32, y = var_1394_promoted)[name = string("op_1406")];
+            tensor<int32, [1]> var_1408_axes_0 = const()[name = string("op_1408_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1408_keep_dims_0 = const()[name = string("op_1408_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1408 = reduce_mean(axes = var_1408_axes_0, keep_dims = var_1408_keep_dims_0, x = var_1406)[name = string("op_1408")];
+            string var_1408_to_fp16_dtype_0 = const()[name = string("op_1408_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1409_to_fp16 = const()[name = string("op_1409_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1408_to_fp16 = cast(dtype = var_1408_to_fp16_dtype_0, x = var_1408)[name = string("cast_426")];
+            tensor<fp16, [1, 50, 1]> mean_squared_45_cast_fp16 = add(x = var_1408_to_fp16, y = var_1409_to_fp16)[name = string("mean_squared_45_cast_fp16")];
+            string mean_squared_45_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_45_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_45_cast_fp16_to_fp32 = cast(dtype = mean_squared_45_cast_fp16_to_fp32_dtype_0, x = mean_squared_45_cast_fp16)[name = string("cast_425")];
+            tensor<fp32, [1, 50, 1]> var_1411 = pow(x = mean_squared_45_cast_fp16_to_fp32, y = var_1398)[name = string("op_1411")];
+            string var_1411_to_fp16_dtype_0 = const()[name = string("op_1411_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1411_to_fp16 = cast(dtype = var_1411_to_fp16_dtype_0, x = var_1411)[name = string("cast_424")];
+            tensor<fp16, [1, 50, 1024]> normed_output_89_cast_fp16 = mul(x = hidden_states_265_cast_fp16, y = var_1411_to_fp16)[name = string("normed_output_89_cast_fp16")];
+            tensor<fp16, [1024]> const_41_to_fp16 = const()[name = string("const_41_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31152832)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_91_cast_fp16 = mul(x = normed_output_89_cast_fp16, y = const_41_to_fp16)[name = string("normed_output_91_cast_fp16")];
+            fp16 lconv1ds_2_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_2_linear_start_input_min_to_fp16"), val = fp16(-0x1.7cp+3)];
+            fp16 lconv1ds_2_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_2_linear_start_input_max_to_fp16"), val = fp16(0x1.78p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_72_cast_fp16 = clip(alpha = lconv1ds_2_linear_start_input_min_to_fp16, beta = lconv1ds_2_linear_start_input_max_to_fp16, x = normed_output_91_cast_fp16)[name = string("clip_72_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_2_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31154944))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32203584))))[name = string("lconv1ds_2_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_30_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_2_linear_start_linear_weight_to_fp16_palettized, x = clip_72_cast_fp16)[name = string("linear_30_cast_fp16")];
+            fp16 lconv1ds_2_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_2_linear_start_output_min_to_fp16"), val = fp16(-0x1.a8p+4)];
+            fp16 lconv1ds_2_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_2_linear_start_output_max_to_fp16"), val = fp16(0x1.a4p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_73_cast_fp16 = clip(alpha = lconv1ds_2_linear_start_output_min_to_fp16, beta = lconv1ds_2_linear_start_output_max_to_fp16, x = linear_30_cast_fp16)[name = string("clip_73_cast_fp16")];
+            int32 hidden_states_273_split_num_splits_0 = const()[name = string("hidden_states_273_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_273_split_axis_0 = const()[name = string("hidden_states_273_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_273_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_273_split_cast_fp16_1 = split(axis = hidden_states_273_split_axis_0, num_splits = hidden_states_273_split_num_splits_0, x = clip_73_cast_fp16)[name = string("hidden_states_273_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_273_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_273_split_cast_fp16_1)[name = string("hidden_states_273_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_273_cast_fp16 = mul(x = hidden_states_273_split_cast_fp16_0, y = hidden_states_273_split_1_sigmoid_cast_fp16)[name = string("hidden_states_273_cast_fp16")];
+            tensor<int32, [3]> input_129_perm_0 = const()[name = string("input_129_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_131_pad_0 = const()[name = string("input_131_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_131_mode_0 = const()[name = string("input_131_mode_0"), val = string("constant")];
+            fp16 const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_129_cast_fp16 = transpose(perm = input_129_perm_0, x = hidden_states_273_cast_fp16)[name = string("transpose_55")];
+            tensor<fp16, [1, 1024, 54]> input_131_cast_fp16 = pad(constant_val = const_42_to_fp16, mode = input_131_mode_0, pad = input_131_pad_0, x = input_129_cast_fp16)[name = string("input_131_cast_fp16")];
+            string var_1437_pad_type_0 = const()[name = string("op_1437_pad_type_0"), val = string("valid")];
+            int32 var_1437_groups_0 = const()[name = string("op_1437_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_1437_strides_0 = const()[name = string("op_1437_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1437_pad_0 = const()[name = string("op_1437_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1437_dilations_0 = const()[name = string("op_1437_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_2_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32205696))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32208320))))[name = string("lconv1ds_2_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_1437_cast_fp16 = conv(dilations = var_1437_dilations_0, groups = var_1437_groups_0, pad = var_1437_pad_0, pad_type = var_1437_pad_type_0, strides = var_1437_strides_0, weight = lconv1ds_2_depthwise_conv1d_weight_to_fp16_palettized, x = input_131_cast_fp16)[name = string("op_1437_cast_fp16")];
+            tensor<int32, [3]> hidden_states_275_perm_0 = const()[name = string("hidden_states_275_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_275_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_275_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_275_cast_fp16 = transpose(perm = hidden_states_275_perm_0, x = var_1437_cast_fp16)[name = string("transpose_54")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_275_cast_fp16_to_fp32 = cast(dtype = hidden_states_275_cast_fp16_to_fp32_dtype_0, x = hidden_states_275_cast_fp16)[name = string("cast_423")];
+            tensor<fp32, [1, 50, 1024]> clip_74 = clip(alpha = var_1386, beta = var_1385, x = hidden_states_275_cast_fp16_to_fp32)[name = string("clip_74")];
+            fp32 var_1394_promoted_1 = const()[name = string("op_1394_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1442 = pow(x = clip_74, y = var_1394_promoted_1)[name = string("op_1442")];
+            tensor<int32, [1]> var_1444_axes_0 = const()[name = string("op_1444_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1444_keep_dims_0 = const()[name = string("op_1444_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1444 = reduce_mean(axes = var_1444_axes_0, keep_dims = var_1444_keep_dims_0, x = var_1442)[name = string("op_1444")];
+            string var_1444_to_fp16_dtype_0 = const()[name = string("op_1444_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1445_to_fp16 = const()[name = string("op_1445_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1444_to_fp16 = cast(dtype = var_1444_to_fp16_dtype_0, x = var_1444)[name = string("cast_422")];
+            tensor<fp16, [1, 50, 1]> mean_squared_47_cast_fp16 = add(x = var_1444_to_fp16, y = var_1445_to_fp16)[name = string("mean_squared_47_cast_fp16")];
+            string mean_squared_47_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_47_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_47_cast_fp16_to_fp32 = cast(dtype = mean_squared_47_cast_fp16_to_fp32_dtype_0, x = mean_squared_47_cast_fp16)[name = string("cast_421")];
+            tensor<fp32, [1, 50, 1]> var_1447 = pow(x = mean_squared_47_cast_fp16_to_fp32, y = var_1398)[name = string("op_1447")];
+            string clip_74_to_fp16_dtype_0 = const()[name = string("clip_74_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1447_to_fp16_dtype_0 = const()[name = string("op_1447_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_74_to_fp16 = cast(dtype = clip_74_to_fp16_dtype_0, x = clip_74)[name = string("cast_419")];
+            tensor<fp16, [1, 50, 1]> var_1447_to_fp16 = cast(dtype = var_1447_to_fp16_dtype_0, x = var_1447)[name = string("cast_420")];
+            tensor<fp16, [1, 50, 1024]> normed_output_93_cast_fp16 = mul(x = clip_74_to_fp16, y = var_1447_to_fp16)[name = string("normed_output_93_cast_fp16")];
+            tensor<fp16, [1024]> const_43_to_fp16 = const()[name = string("const_43_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32209408)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_95_cast_fp16 = mul(x = normed_output_93_cast_fp16, y = const_43_to_fp16)[name = string("normed_output_95_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_281_cast_fp16 = silu(x = normed_output_95_cast_fp16)[name = string("hidden_states_281_cast_fp16")];
+            fp16 lconv1ds_2_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_2_linear_end_input_min_to_fp16"), val = fp16(-0x1.9p+2)];
+            fp16 lconv1ds_2_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_2_linear_end_input_max_to_fp16"), val = fp16(0x1.8ep+2)];
+            tensor<fp16, [1, 50, 1024]> clip_75_cast_fp16 = clip(alpha = lconv1ds_2_linear_end_input_min_to_fp16, beta = lconv1ds_2_linear_end_input_max_to_fp16, x = hidden_states_281_cast_fp16)[name = string("clip_75_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_2_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32211520))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32735872))))[name = string("lconv1ds_2_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_31_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_2_linear_end_linear_weight_to_fp16_palettized, x = clip_75_cast_fp16)[name = string("linear_31_cast_fp16")];
+            fp16 lconv1ds_2_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_2_linear_end_output_min_to_fp16"), val = fp16(-0x1.98p+2)];
+            fp16 lconv1ds_2_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_2_linear_end_output_max_to_fp16"), val = fp16(0x1.94p+2)];
+            tensor<fp16, [1, 50, 1024]> clip_76_cast_fp16 = clip(alpha = lconv1ds_2_linear_end_output_min_to_fp16, beta = lconv1ds_2_linear_end_output_max_to_fp16, x = linear_31_cast_fp16)[name = string("clip_76_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_287_cast_fp16 = add(x = clip_76_cast_fp16, y = hidden_states_265_cast_fp16)[name = string("hidden_states_287_cast_fp16")];
+            string hidden_states_287_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_287_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1471 = const()[name = string("op_1471"), val = fp32(-0x1p-1)];
+            fp32 var_1472 = const()[name = string("op_1472"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1473 = const()[name = string("op_1473"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_287_cast_fp16_to_fp32 = cast(dtype = hidden_states_287_cast_fp16_to_fp32_dtype_0, x = hidden_states_287_cast_fp16)[name = string("cast_418")];
+            tensor<fp32, [1, 50, 1024]> clip_77 = clip(alpha = var_1473, beta = var_1472, x = hidden_states_287_cast_fp16_to_fp32)[name = string("clip_77")];
+            fp32 var_1467_promoted = const()[name = string("op_1467_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1481 = pow(x = clip_77, y = var_1467_promoted)[name = string("op_1481")];
+            tensor<int32, [1]> var_1483_axes_0 = const()[name = string("op_1483_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1483_keep_dims_0 = const()[name = string("op_1483_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1483 = reduce_mean(axes = var_1483_axes_0, keep_dims = var_1483_keep_dims_0, x = var_1481)[name = string("op_1483")];
+            string var_1483_to_fp16_dtype_0 = const()[name = string("op_1483_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1484_to_fp16 = const()[name = string("op_1484_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1483_to_fp16 = cast(dtype = var_1483_to_fp16_dtype_0, x = var_1483)[name = string("cast_417")];
+            tensor<fp16, [1, 50, 1]> mean_squared_49_cast_fp16 = add(x = var_1483_to_fp16, y = var_1484_to_fp16)[name = string("mean_squared_49_cast_fp16")];
+            string mean_squared_49_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_49_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_49_cast_fp16_to_fp32 = cast(dtype = mean_squared_49_cast_fp16_to_fp32_dtype_0, x = mean_squared_49_cast_fp16)[name = string("cast_416")];
+            tensor<fp32, [1, 50, 1]> var_1486 = pow(x = mean_squared_49_cast_fp16_to_fp32, y = var_1471)[name = string("op_1486")];
+            string clip_77_to_fp16_dtype_0 = const()[name = string("clip_77_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1486_to_fp16_dtype_0 = const()[name = string("op_1486_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_77_to_fp16 = cast(dtype = clip_77_to_fp16_dtype_0, x = clip_77)[name = string("cast_414")];
+            tensor<fp16, [1, 50, 1]> var_1486_to_fp16 = cast(dtype = var_1486_to_fp16_dtype_0, x = var_1486)[name = string("cast_415")];
+            tensor<fp16, [1, 50, 1024]> normed_output_97_cast_fp16 = mul(x = clip_77_to_fp16, y = var_1486_to_fp16)[name = string("normed_output_97_cast_fp16")];
+            tensor<fp16, [1024]> const_44_to_fp16 = const()[name = string("const_44_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32736960)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_99_cast_fp16 = mul(x = normed_output_97_cast_fp16, y = const_44_to_fp16)[name = string("normed_output_99_cast_fp16")];
+            fp16 feed_forward2s_2_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.7cp+3)];
+            fp16 feed_forward2s_2_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.78p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_78_cast_fp16 = clip(alpha = feed_forward2s_2_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_2_ffw_layer_1_input_max_to_fp16, x = normed_output_99_cast_fp16)[name = string("clip_78_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_2_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32739072))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34836288))))[name = string("feed_forward2s_2_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_32_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_2_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_78_cast_fp16)[name = string("linear_32_cast_fp16")];
+            fp16 feed_forward2s_2_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.b8p+4)];
+            fp16 feed_forward2s_2_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.b6p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_79_cast_fp16 = clip(alpha = feed_forward2s_2_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_2_ffw_layer_1_output_max_to_fp16, x = linear_32_cast_fp16)[name = string("clip_79_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_297_cast_fp16 = silu(x = clip_79_cast_fp16)[name = string("hidden_states_297_cast_fp16")];
+            fp16 feed_forward2s_2_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.28p+3)];
+            fp16 feed_forward2s_2_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.26p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_80_cast_fp16 = clip(alpha = feed_forward2s_2_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_2_ffw_layer_2_input_max_to_fp16, x = hidden_states_297_cast_fp16)[name = string("clip_80_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_2_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(34840448))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36937664))))[name = string("feed_forward2s_2_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_33_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_2_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_80_cast_fp16)[name = string("linear_33_cast_fp16")];
+            fp16 feed_forward2s_2_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.38p+5)];
+            fp16 feed_forward2s_2_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_2_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.36p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_81_cast_fp16 = clip(alpha = feed_forward2s_2_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_2_ffw_layer_2_output_max_to_fp16, x = linear_33_cast_fp16)[name = string("clip_81_cast_fp16")];
+            string clip_81_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_81_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_81_cast_fp16_to_fp32 = cast(dtype = clip_81_cast_fp16_to_fp32_dtype_0, x = clip_81_cast_fp16)[name = string("cast_413")];
+            tensor<fp32, [1, 50, 1024]> clip_82 = clip(alpha = var_1473, beta = var_1472, x = clip_81_cast_fp16_to_fp32)[name = string("clip_82")];
+            fp32 var_1467_promoted_1 = const()[name = string("op_1467_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1513 = pow(x = clip_82, y = var_1467_promoted_1)[name = string("op_1513")];
+            tensor<int32, [1]> var_1515_axes_0 = const()[name = string("op_1515_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1515_keep_dims_0 = const()[name = string("op_1515_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1515 = reduce_mean(axes = var_1515_axes_0, keep_dims = var_1515_keep_dims_0, x = var_1513)[name = string("op_1515")];
+            string var_1515_to_fp16_dtype_0 = const()[name = string("op_1515_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1516_to_fp16 = const()[name = string("op_1516_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1515_to_fp16 = cast(dtype = var_1515_to_fp16_dtype_0, x = var_1515)[name = string("cast_412")];
+            tensor<fp16, [1, 50, 1]> mean_squared_51_cast_fp16 = add(x = var_1515_to_fp16, y = var_1516_to_fp16)[name = string("mean_squared_51_cast_fp16")];
+            string mean_squared_51_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_51_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_51_cast_fp16_to_fp32 = cast(dtype = mean_squared_51_cast_fp16_to_fp32_dtype_0, x = mean_squared_51_cast_fp16)[name = string("cast_411")];
+            tensor<fp32, [1, 50, 1]> var_1518 = pow(x = mean_squared_51_cast_fp16_to_fp32, y = var_1471)[name = string("op_1518")];
+            string clip_82_to_fp16_dtype_0 = const()[name = string("clip_82_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1518_to_fp16_dtype_0 = const()[name = string("op_1518_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_82_to_fp16 = cast(dtype = clip_82_to_fp16_dtype_0, x = clip_82)[name = string("cast_409")];
+            tensor<fp16, [1, 50, 1]> var_1518_to_fp16 = cast(dtype = var_1518_to_fp16_dtype_0, x = var_1518)[name = string("cast_410")];
+            tensor<fp16, [1, 50, 1024]> normed_output_101_cast_fp16 = mul(x = clip_82_to_fp16, y = var_1518_to_fp16)[name = string("normed_output_101_cast_fp16")];
+            tensor<fp16, [1024]> const_45_to_fp16 = const()[name = string("const_45_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36938752)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_103_cast_fp16 = mul(x = normed_output_101_cast_fp16, y = const_45_to_fp16)[name = string("normed_output_103_cast_fp16")];
+            fp16 var_1463_to_fp16 = const()[name = string("op_1463_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_309_cast_fp16 = mul(x = normed_output_103_cast_fp16, y = var_1463_to_fp16)[name = string("hidden_states_309_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_311_cast_fp16 = add(x = hidden_states_309_cast_fp16, y = hidden_states_287_cast_fp16)[name = string("hidden_states_311_cast_fp16")];
+            fp16 var_1525_to_fp16 = const()[name = string("op_1525_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1526_to_fp16 = const()[name = string("op_1526_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_83_cast_fp16 = clip(alpha = var_1525_to_fp16, beta = var_1526_to_fp16, x = hidden_states_311_cast_fp16)[name = string("clip_83_cast_fp16")];
+            string clip_83_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_83_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1528 = const()[name = string("op_1528"), val = fp32(-0x1p-1)];
+            fp32 var_1532_promoted = const()[name = string("op_1532_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_83_cast_fp16_to_fp32 = cast(dtype = clip_83_cast_fp16_to_fp32_dtype_0, x = clip_83_cast_fp16)[name = string("cast_408")];
+            tensor<fp32, [1, 50, 1024]> var_1538 = pow(x = clip_83_cast_fp16_to_fp32, y = var_1532_promoted)[name = string("op_1538")];
+            tensor<int32, [1]> var_1540_axes_0 = const()[name = string("op_1540_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1540_keep_dims_0 = const()[name = string("op_1540_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1540 = reduce_mean(axes = var_1540_axes_0, keep_dims = var_1540_keep_dims_0, x = var_1538)[name = string("op_1540")];
+            string var_1540_to_fp16_dtype_0 = const()[name = string("op_1540_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1541_to_fp16 = const()[name = string("op_1541_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1540_to_fp16 = cast(dtype = var_1540_to_fp16_dtype_0, x = var_1540)[name = string("cast_407")];
+            tensor<fp16, [1, 50, 1]> mean_squared_53_cast_fp16 = add(x = var_1540_to_fp16, y = var_1541_to_fp16)[name = string("mean_squared_53_cast_fp16")];
+            string mean_squared_53_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_53_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_53_cast_fp16_to_fp32 = cast(dtype = mean_squared_53_cast_fp16_to_fp32_dtype_0, x = mean_squared_53_cast_fp16)[name = string("cast_406")];
+            tensor<fp32, [1, 50, 1]> var_1543 = pow(x = mean_squared_53_cast_fp16_to_fp32, y = var_1528)[name = string("op_1543")];
+            string var_1543_to_fp16_dtype_0 = const()[name = string("op_1543_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1543_to_fp16 = cast(dtype = var_1543_to_fp16_dtype_0, x = var_1543)[name = string("cast_405")];
+            tensor<fp16, [1, 50, 1024]> normed_output_105_cast_fp16 = mul(x = clip_83_cast_fp16, y = var_1543_to_fp16)[name = string("normed_output_105_cast_fp16")];
+            tensor<fp16, [1024]> const_46_to_fp16 = const()[name = string("const_46_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36940864)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_107_cast_fp16 = mul(x = normed_output_105_cast_fp16, y = const_46_to_fp16)[name = string("normed_output_107_cast_fp16")];
+            string normed_output_107_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_107_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1556 = const()[name = string("op_1556"), val = fp32(-0x1p-1)];
+            fp32 var_1557 = const()[name = string("op_1557"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1558 = const()[name = string("op_1558"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_107_cast_fp16_to_fp32 = cast(dtype = normed_output_107_cast_fp16_to_fp32_dtype_0, x = normed_output_107_cast_fp16)[name = string("cast_404")];
+            tensor<fp32, [1, 50, 1024]> clip_84 = clip(alpha = var_1558, beta = var_1557, x = normed_output_107_cast_fp16_to_fp32)[name = string("clip_84")];
+            fp32 var_1552_promoted = const()[name = string("op_1552_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1566 = pow(x = clip_84, y = var_1552_promoted)[name = string("op_1566")];
+            tensor<int32, [1]> var_1568_axes_0 = const()[name = string("op_1568_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1568_keep_dims_0 = const()[name = string("op_1568_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1568 = reduce_mean(axes = var_1568_axes_0, keep_dims = var_1568_keep_dims_0, x = var_1566)[name = string("op_1568")];
+            string var_1568_to_fp16_dtype_0 = const()[name = string("op_1568_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1569_to_fp16 = const()[name = string("op_1569_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1568_to_fp16 = cast(dtype = var_1568_to_fp16_dtype_0, x = var_1568)[name = string("cast_403")];
+            tensor<fp16, [1, 50, 1]> mean_squared_55_cast_fp16 = add(x = var_1568_to_fp16, y = var_1569_to_fp16)[name = string("mean_squared_55_cast_fp16")];
+            string mean_squared_55_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_55_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_55_cast_fp16_to_fp32 = cast(dtype = mean_squared_55_cast_fp16_to_fp32_dtype_0, x = mean_squared_55_cast_fp16)[name = string("cast_402")];
+            tensor<fp32, [1, 50, 1]> var_1571 = pow(x = mean_squared_55_cast_fp16_to_fp32, y = var_1556)[name = string("op_1571")];
+            string clip_84_to_fp16_dtype_0 = const()[name = string("clip_84_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1571_to_fp16_dtype_0 = const()[name = string("op_1571_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_84_to_fp16 = cast(dtype = clip_84_to_fp16_dtype_0, x = clip_84)[name = string("cast_400")];
+            tensor<fp16, [1, 50, 1]> var_1571_to_fp16 = cast(dtype = var_1571_to_fp16_dtype_0, x = var_1571)[name = string("cast_401")];
+            tensor<fp16, [1, 50, 1024]> normed_output_109_cast_fp16 = mul(x = clip_84_to_fp16, y = var_1571_to_fp16)[name = string("normed_output_109_cast_fp16")];
+            tensor<fp16, [1024]> const_47_to_fp16 = const()[name = string("const_47_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36942976)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_111_cast_fp16 = mul(x = normed_output_109_cast_fp16, y = const_47_to_fp16)[name = string("normed_output_111_cast_fp16")];
+            fp16 feed_forward1s_3_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.8ap+3)];
+            fp16 feed_forward1s_3_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.88p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_85_cast_fp16 = clip(alpha = feed_forward1s_3_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_3_ffw_layer_1_input_max_to_fp16, x = normed_output_111_cast_fp16)[name = string("clip_85_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_3_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36945088))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39042304))))[name = string("feed_forward1s_3_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_34_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_3_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_85_cast_fp16)[name = string("linear_34_cast_fp16")];
+            fp16 feed_forward1s_3_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.bp+4)];
+            fp16 feed_forward1s_3_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.acp+4)];
+            tensor<fp16, [1, 50, 4096]> clip_86_cast_fp16 = clip(alpha = feed_forward1s_3_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_3_ffw_layer_1_output_max_to_fp16, x = linear_34_cast_fp16)[name = string("clip_86_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_327_cast_fp16 = silu(x = clip_86_cast_fp16)[name = string("hidden_states_327_cast_fp16")];
+            fp16 feed_forward1s_3_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.26p+3)];
+            fp16 feed_forward1s_3_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.24p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_87_cast_fp16 = clip(alpha = feed_forward1s_3_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_3_ffw_layer_2_input_max_to_fp16, x = hidden_states_327_cast_fp16)[name = string("clip_87_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_3_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39046464))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41143680))))[name = string("feed_forward1s_3_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_35_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_3_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_87_cast_fp16)[name = string("linear_35_cast_fp16")];
+            fp16 feed_forward1s_3_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.48p+5)];
+            fp16 feed_forward1s_3_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_3_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.46p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_88_cast_fp16 = clip(alpha = feed_forward1s_3_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_3_ffw_layer_2_output_max_to_fp16, x = linear_35_cast_fp16)[name = string("clip_88_cast_fp16")];
+            string clip_88_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_88_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_88_cast_fp16_to_fp32 = cast(dtype = clip_88_cast_fp16_to_fp32_dtype_0, x = clip_88_cast_fp16)[name = string("cast_399")];
+            tensor<fp32, [1, 50, 1024]> clip_89 = clip(alpha = var_1558, beta = var_1557, x = clip_88_cast_fp16_to_fp32)[name = string("clip_89")];
+            fp32 var_1552_promoted_1 = const()[name = string("op_1552_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1598 = pow(x = clip_89, y = var_1552_promoted_1)[name = string("op_1598")];
+            tensor<int32, [1]> var_1600_axes_0 = const()[name = string("op_1600_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1600_keep_dims_0 = const()[name = string("op_1600_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1600 = reduce_mean(axes = var_1600_axes_0, keep_dims = var_1600_keep_dims_0, x = var_1598)[name = string("op_1600")];
+            string var_1600_to_fp16_dtype_0 = const()[name = string("op_1600_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1601_to_fp16 = const()[name = string("op_1601_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1600_to_fp16 = cast(dtype = var_1600_to_fp16_dtype_0, x = var_1600)[name = string("cast_398")];
+            tensor<fp16, [1, 50, 1]> mean_squared_57_cast_fp16 = add(x = var_1600_to_fp16, y = var_1601_to_fp16)[name = string("mean_squared_57_cast_fp16")];
+            string mean_squared_57_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_57_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_57_cast_fp16_to_fp32 = cast(dtype = mean_squared_57_cast_fp16_to_fp32_dtype_0, x = mean_squared_57_cast_fp16)[name = string("cast_397")];
+            tensor<fp32, [1, 50, 1]> var_1603 = pow(x = mean_squared_57_cast_fp16_to_fp32, y = var_1556)[name = string("op_1603")];
+            string clip_89_to_fp16_dtype_0 = const()[name = string("clip_89_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1603_to_fp16_dtype_0 = const()[name = string("op_1603_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_89_to_fp16 = cast(dtype = clip_89_to_fp16_dtype_0, x = clip_89)[name = string("cast_395")];
+            tensor<fp16, [1, 50, 1]> var_1603_to_fp16 = cast(dtype = var_1603_to_fp16_dtype_0, x = var_1603)[name = string("cast_396")];
+            tensor<fp16, [1, 50, 1024]> normed_output_113_cast_fp16 = mul(x = clip_89_to_fp16, y = var_1603_to_fp16)[name = string("normed_output_113_cast_fp16")];
+            tensor<fp16, [1024]> const_48_to_fp16 = const()[name = string("const_48_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41144768)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_115_cast_fp16 = mul(x = normed_output_113_cast_fp16, y = const_48_to_fp16)[name = string("normed_output_115_cast_fp16")];
+            fp16 var_1548_to_fp16 = const()[name = string("op_1548_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_339_cast_fp16 = mul(x = normed_output_115_cast_fp16, y = var_1548_to_fp16)[name = string("hidden_states_339_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_341_cast_fp16 = add(x = hidden_states_339_cast_fp16, y = normed_output_107_cast_fp16)[name = string("hidden_states_341_cast_fp16")];
+            fp16 var_1610_to_fp16 = const()[name = string("op_1610_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1611_to_fp16 = const()[name = string("op_1611_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_90_cast_fp16 = clip(alpha = var_1610_to_fp16, beta = var_1611_to_fp16, x = hidden_states_341_cast_fp16)[name = string("clip_90_cast_fp16")];
+            string clip_90_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_90_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1613 = const()[name = string("op_1613"), val = fp32(-0x1p-1)];
+            fp32 var_1617_promoted = const()[name = string("op_1617_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_90_cast_fp16_to_fp32 = cast(dtype = clip_90_cast_fp16_to_fp32_dtype_0, x = clip_90_cast_fp16)[name = string("cast_394")];
+            tensor<fp32, [1, 50, 1024]> var_1623 = pow(x = clip_90_cast_fp16_to_fp32, y = var_1617_promoted)[name = string("op_1623")];
+            tensor<int32, [1]> var_1625_axes_0 = const()[name = string("op_1625_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1625_keep_dims_0 = const()[name = string("op_1625_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1625 = reduce_mean(axes = var_1625_axes_0, keep_dims = var_1625_keep_dims_0, x = var_1623)[name = string("op_1625")];
+            string var_1625_to_fp16_dtype_0 = const()[name = string("op_1625_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1626_to_fp16 = const()[name = string("op_1626_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1625_to_fp16 = cast(dtype = var_1625_to_fp16_dtype_0, x = var_1625)[name = string("cast_393")];
+            tensor<fp16, [1, 50, 1]> mean_squared_59_cast_fp16 = add(x = var_1625_to_fp16, y = var_1626_to_fp16)[name = string("mean_squared_59_cast_fp16")];
+            string mean_squared_59_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_59_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_59_cast_fp16_to_fp32 = cast(dtype = mean_squared_59_cast_fp16_to_fp32_dtype_0, x = mean_squared_59_cast_fp16)[name = string("cast_392")];
+            tensor<fp32, [1, 50, 1]> var_1628 = pow(x = mean_squared_59_cast_fp16_to_fp32, y = var_1613)[name = string("op_1628")];
+            string var_1628_to_fp16_dtype_0 = const()[name = string("op_1628_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1628_to_fp16 = cast(dtype = var_1628_to_fp16_dtype_0, x = var_1628)[name = string("cast_391")];
+            tensor<fp16, [1, 50, 1024]> normed_output_117_cast_fp16 = mul(x = clip_90_cast_fp16, y = var_1628_to_fp16)[name = string("normed_output_117_cast_fp16")];
+            tensor<fp16, [1024]> const_49_to_fp16 = const()[name = string("const_49_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41146880)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_119_cast_fp16 = mul(x = normed_output_117_cast_fp16, y = const_49_to_fp16)[name = string("normed_output_119_cast_fp16")];
+            int32 var_1634 = const()[name = string("op_1634"), val = int32(-1)];
+            fp32 var_1635 = const()[name = string("op_1635"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_3_q_proj_input_min_to_fp16 = const()[name = string("self_attns_3_q_proj_input_min_to_fp16"), val = fp16(-0x1.7ap+3)];
+            fp16 self_attns_3_q_proj_input_max_to_fp16 = const()[name = string("self_attns_3_q_proj_input_max_to_fp16"), val = fp16(0x1.78p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_91_cast_fp16 = clip(alpha = self_attns_3_q_proj_input_min_to_fp16, beta = self_attns_3_q_proj_input_max_to_fp16, x = normed_output_119_cast_fp16)[name = string("clip_91_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_3_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41148992))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41673344))))[name = string("self_attns_3_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_36_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_3_q_proj_linear_weight_to_fp16_palettized, x = clip_91_cast_fp16)[name = string("linear_36_cast_fp16")];
+            fp16 self_attns_3_q_proj_output_min_to_fp16 = const()[name = string("self_attns_3_q_proj_output_min_to_fp16"), val = fp16(-0x1.2cp+4)];
+            fp16 self_attns_3_q_proj_output_max_to_fp16 = const()[name = string("self_attns_3_q_proj_output_max_to_fp16"), val = fp16(0x1.2ap+4)];
+            tensor<fp16, [1, 50, 1024]> clip_92_cast_fp16 = clip(alpha = self_attns_3_q_proj_output_min_to_fp16, beta = self_attns_3_q_proj_output_max_to_fp16, x = linear_36_cast_fp16)[name = string("clip_92_cast_fp16")];
+            tensor<int32, [4]> var_1679 = const()[name = string("op_1679"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_7_cast_fp16 = reshape(shape = var_1679, x = clip_92_cast_fp16)[name = string("q_7_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_3_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41674432))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42198784))))[name = string("self_attns_3_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_37_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_3_k_proj_linear_weight_to_fp16_palettized, x = clip_91_cast_fp16)[name = string("linear_37_cast_fp16")];
+            fp16 self_attns_3_k_proj_output_min_to_fp16 = const()[name = string("self_attns_3_k_proj_output_min_to_fp16"), val = fp16(-0x1.2cp+4)];
+            fp16 self_attns_3_k_proj_output_max_to_fp16 = const()[name = string("self_attns_3_k_proj_output_max_to_fp16"), val = fp16(0x1.2ap+4)];
+            tensor<fp16, [1, 50, 1024]> clip_94_cast_fp16 = clip(alpha = self_attns_3_k_proj_output_min_to_fp16, beta = self_attns_3_k_proj_output_max_to_fp16, x = linear_37_cast_fp16)[name = string("clip_94_cast_fp16")];
+            tensor<int32, [4]> var_1691 = const()[name = string("op_1691"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_7_cast_fp16 = reshape(shape = var_1691, x = clip_94_cast_fp16)[name = string("k_7_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_3_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42199872))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42724224))))[name = string("self_attns_3_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_38_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_3_v_proj_linear_weight_to_fp16_palettized, x = clip_91_cast_fp16)[name = string("linear_38_cast_fp16")];
+            fp16 self_attns_3_v_proj_output_min_to_fp16 = const()[name = string("self_attns_3_v_proj_output_min_to_fp16"), val = fp16(-0x1.2cp+4)];
+            fp16 self_attns_3_v_proj_output_max_to_fp16 = const()[name = string("self_attns_3_v_proj_output_max_to_fp16"), val = fp16(0x1.2ap+4)];
+            tensor<fp16, [1, 50, 1024]> clip_96_cast_fp16 = clip(alpha = self_attns_3_v_proj_output_min_to_fp16, beta = self_attns_3_v_proj_output_max_to_fp16, x = linear_38_cast_fp16)[name = string("clip_96_cast_fp16")];
+            tensor<int32, [4]> var_1703 = const()[name = string("op_1703"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_159_cast_fp16 = reshape(shape = var_1703, x = clip_96_cast_fp16)[name = string("input_159_cast_fp16")];
+            fp16 var_1705_to_fp16 = const()[name = string("op_1705_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_1706_cast_fp16 = mul(x = q_7_cast_fp16, y = var_1705_to_fp16)[name = string("op_1706_cast_fp16")];
+            tensor<fp16, [128]> var_1707_to_fp16 = const()[name = string("op_1707_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42725312)))];
+            tensor<fp16, [1, 50, 8, 128]> input_155_cast_fp16 = mul(x = var_1706_cast_fp16, y = var_1707_to_fp16)[name = string("input_155_cast_fp16")];
+            fp16 var_1709_to_fp16 = const()[name = string("op_1709_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_157_cast_fp16 = mul(x = k_7_cast_fp16, y = var_1709_to_fp16)[name = string("input_157_cast_fp16")];
+            tensor<int32, [8]> q_padded_7_pad_0 = const()[name = string("q_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_7_mode_0 = const()[name = string("q_padded_7_mode_0"), val = string("constant")];
+            fp16 const_50_to_fp16 = const()[name = string("const_50_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_7_cast_fp16 = pad(constant_val = const_50_to_fp16, mode = q_padded_7_mode_0, pad = q_padded_7_pad_0, x = input_155_cast_fp16)[name = string("q_padded_7_cast_fp16")];
+            tensor<int32, [5]> var_1713 = const()[name = string("op_1713"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_7_cast_fp16 = reshape(shape = var_1713, x = q_padded_7_cast_fp16)[name = string("q_blocks_7_cast_fp16")];
+            tensor<int32, [8]> k_padded_7_pad_0 = const()[name = string("k_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_7_mode_0 = const()[name = string("k_padded_7_mode_0"), val = string("constant")];
+            fp16 const_51_to_fp16 = const()[name = string("const_51_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_7_cast_fp16 = pad(constant_val = const_51_to_fp16, mode = k_padded_7_mode_0, pad = k_padded_7_pad_0, x = input_157_cast_fp16)[name = string("k_padded_7_cast_fp16")];
+            tensor<int32, [8]> v_padded_7_pad_0 = const()[name = string("v_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_7_mode_0 = const()[name = string("v_padded_7_mode_0"), val = string("constant")];
+            fp16 const_52_to_fp16 = const()[name = string("const_52_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_7_cast_fp16 = pad(constant_val = const_52_to_fp16, mode = v_padded_7_mode_0, pad = v_padded_7_pad_0, x = input_159_cast_fp16)[name = string("v_padded_7_cast_fp16")];
+            tensor<int32, [4]> var_1720_begin_0 = const()[name = string("op_1720_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1720_end_0 = const()[name = string("op_1720_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_1720_end_mask_0 = const()[name = string("op_1720_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1720_cast_fp16 = slice_by_index(begin = var_1720_begin_0, end = var_1720_end_0, end_mask = var_1720_end_mask_0, x = k_padded_7_cast_fp16)[name = string("op_1720_cast_fp16")];
+            tensor<int32, [4]> var_1722_begin_0 = const()[name = string("op_1722_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_1722_end_0 = const()[name = string("op_1722_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_1722_end_mask_0 = const()[name = string("op_1722_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1722_cast_fp16 = slice_by_index(begin = var_1722_begin_0, end = var_1722_end_0, end_mask = var_1722_end_mask_0, x = k_padded_7_cast_fp16)[name = string("op_1722_cast_fp16")];
+            tensor<int32, [4]> var_1724_begin_0 = const()[name = string("op_1724_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_1724_end_0 = const()[name = string("op_1724_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_1724_end_mask_0 = const()[name = string("op_1724_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1724_cast_fp16 = slice_by_index(begin = var_1724_begin_0, end = var_1724_end_0, end_mask = var_1724_end_mask_0, x = k_padded_7_cast_fp16)[name = string("op_1724_cast_fp16")];
+            tensor<int32, [4]> var_1726_begin_0 = const()[name = string("op_1726_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_1726_end_0 = const()[name = string("op_1726_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_1726_end_mask_0 = const()[name = string("op_1726_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1726_cast_fp16 = slice_by_index(begin = var_1726_begin_0, end = var_1726_end_0, end_mask = var_1726_end_mask_0, x = k_padded_7_cast_fp16)[name = string("op_1726_cast_fp16")];
+            tensor<int32, [4]> var_1728_begin_0 = const()[name = string("op_1728_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_1728_end_0 = const()[name = string("op_1728_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_1728_end_mask_0 = const()[name = string("op_1728_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1728_cast_fp16 = slice_by_index(begin = var_1728_begin_0, end = var_1728_end_0, end_mask = var_1728_end_mask_0, x = k_padded_7_cast_fp16)[name = string("op_1728_cast_fp16")];
+            int32 k_blocks_7_axis_0 = const()[name = string("k_blocks_7_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_7_cast_fp16 = stack(axis = k_blocks_7_axis_0, values = (var_1720_cast_fp16, var_1722_cast_fp16, var_1724_cast_fp16, var_1726_cast_fp16, var_1728_cast_fp16))[name = string("k_blocks_7_cast_fp16")];
+            tensor<int32, [4]> var_1732_begin_0 = const()[name = string("op_1732_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1732_end_0 = const()[name = string("op_1732_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_1732_end_mask_0 = const()[name = string("op_1732_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1732_cast_fp16 = slice_by_index(begin = var_1732_begin_0, end = var_1732_end_0, end_mask = var_1732_end_mask_0, x = v_padded_7_cast_fp16)[name = string("op_1732_cast_fp16")];
+            tensor<int32, [4]> var_1734_begin_0 = const()[name = string("op_1734_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_1734_end_0 = const()[name = string("op_1734_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_1734_end_mask_0 = const()[name = string("op_1734_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1734_cast_fp16 = slice_by_index(begin = var_1734_begin_0, end = var_1734_end_0, end_mask = var_1734_end_mask_0, x = v_padded_7_cast_fp16)[name = string("op_1734_cast_fp16")];
+            tensor<int32, [4]> var_1736_begin_0 = const()[name = string("op_1736_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_1736_end_0 = const()[name = string("op_1736_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_1736_end_mask_0 = const()[name = string("op_1736_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1736_cast_fp16 = slice_by_index(begin = var_1736_begin_0, end = var_1736_end_0, end_mask = var_1736_end_mask_0, x = v_padded_7_cast_fp16)[name = string("op_1736_cast_fp16")];
+            tensor<int32, [4]> var_1738_begin_0 = const()[name = string("op_1738_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_1738_end_0 = const()[name = string("op_1738_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_1738_end_mask_0 = const()[name = string("op_1738_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1738_cast_fp16 = slice_by_index(begin = var_1738_begin_0, end = var_1738_end_0, end_mask = var_1738_end_mask_0, x = v_padded_7_cast_fp16)[name = string("op_1738_cast_fp16")];
+            tensor<int32, [4]> var_1740_begin_0 = const()[name = string("op_1740_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_1740_end_0 = const()[name = string("op_1740_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_1740_end_mask_0 = const()[name = string("op_1740_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_1740_cast_fp16 = slice_by_index(begin = var_1740_begin_0, end = var_1740_end_0, end_mask = var_1740_end_mask_0, x = v_padded_7_cast_fp16)[name = string("op_1740_cast_fp16")];
+            int32 v_blocks_7_axis_0 = const()[name = string("v_blocks_7_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_7_cast_fp16 = stack(axis = v_blocks_7_axis_0, values = (var_1732_cast_fp16, var_1734_cast_fp16, var_1736_cast_fp16, var_1738_cast_fp16, var_1740_cast_fp16))[name = string("v_blocks_7_cast_fp16")];
+            tensor<int32, [5]> var_1748 = const()[name = string("op_1748"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_1750 = const()[name = string("op_1750"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_7_transpose_x_0 = const()[name = string("matrix_ac_7_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_7_transpose_y_0 = const()[name = string("matrix_ac_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_7_cast_fp16 = transpose(perm = var_1748, x = q_blocks_7_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_7_cast_fp16 = transpose(perm = var_1750, x = k_blocks_7_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_7_cast_fp16 = matmul(transpose_x = matrix_ac_7_transpose_x_0, transpose_y = matrix_ac_7_transpose_y_0, x = queries_7_cast_fp16, y = keys_t_7_cast_fp16)[name = string("matrix_ac_7_cast_fp16")];
+            tensor<int32, [4]> var_1753 = const()[name = string("op_1753"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_7_cast_fp16 = reshape(shape = var_1753, x = queries_7_cast_fp16)[name = string("q_flat_7_cast_fp16")];
+            bool matrix_bd_31_transpose_x_0 = const()[name = string("matrix_bd_31_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_31_transpose_y_0 = const()[name = string("matrix_bd_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_7_to_fp16 = const()[name = string("rel_k_t_7_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42725632)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_31_cast_fp16 = matmul(transpose_x = matrix_bd_31_transpose_x_0, transpose_y = matrix_bd_31_transpose_y_0, x = q_flat_7_cast_fp16, y = rel_k_t_7_to_fp16)[name = string("matrix_bd_31_cast_fp16")];
+            tensor<int32, [5]> var_1758 = const()[name = string("op_1758"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_161_cast_fp16 = reshape(shape = var_1758, x = matrix_bd_31_cast_fp16)[name = string("input_161_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_33_pad_0 = const()[name = string("matrix_bd_33_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42752320)))];
+            string matrix_bd_33_mode_0 = const()[name = string("matrix_bd_33_mode_0"), val = string("constant")];
+            fp16 const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_33_cast_fp16 = pad(constant_val = const_54_to_fp16, mode = matrix_bd_33_mode_0, pad = matrix_bd_33_pad_0, x = input_161_cast_fp16)[name = string("matrix_bd_33_cast_fp16")];
+            tensor<int32, [4]> var_1762 = const()[name = string("op_1762"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_35_cast_fp16 = reshape(shape = var_1762, x = matrix_bd_33_cast_fp16)[name = string("matrix_bd_35_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_37_begin_0 = const()[name = string("matrix_bd_37_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_37_end_0 = const()[name = string("matrix_bd_37_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_37_end_mask_0 = const()[name = string("matrix_bd_37_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_37_cast_fp16 = slice_by_index(begin = matrix_bd_37_begin_0, end = matrix_bd_37_end_0, end_mask = matrix_bd_37_end_mask_0, x = matrix_bd_35_cast_fp16)[name = string("matrix_bd_37_cast_fp16")];
+            tensor<int32, [5]> var_1768 = const()[name = string("op_1768"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_39_cast_fp16 = reshape(shape = var_1768, x = matrix_bd_37_cast_fp16)[name = string("matrix_bd_39_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_19_cast_fp16 = add(x = matrix_ac_7_cast_fp16, y = matrix_bd_39_cast_fp16)[name = string("attn_19_cast_fp16")];
+            fp16 _inversed_1771_y_0_to_fp16 = const()[name = string("_inversed_1771_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_1771_cast_fp16 = mul(x = attn_19_cast_fp16, y = _inversed_1771_y_0_to_fp16)[name = string("_inversed_1771_cast_fp16")];
+            string _inversed_1771_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_1771_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_1771_cast_fp16_to_fp32 = cast(dtype = _inversed_1771_cast_fp16_to_fp32_dtype_0, x = _inversed_1771_cast_fp16)[name = string("cast_390")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_1772 = tanh(x = _inversed_1771_cast_fp16_to_fp32)[name = string("op_1772")];
+            string var_1772_to_fp16_dtype_0 = const()[name = string("op_1772_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_3_softcap_to_fp16 = const()[name = string("self_attns_3_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_1772_to_fp16 = cast(dtype = var_1772_to_fp16_dtype_0, x = var_1772)[name = string("cast_389")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_21_cast_fp16 = mul(x = var_1772_to_fp16, y = self_attns_3_softcap_to_fp16)[name = string("attn_21_cast_fp16")];
+            string attn_21_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_21_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_21_cast_fp16_to_fp32 = cast(dtype = attn_21_cast_fp16_to_fp32_dtype_0, x = attn_21_cast_fp16)[name = string("cast_388")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_163 = select(a = var_1635, b = attn_21_cast_fp16_to_fp32, cond = var_460)[name = string("input_163")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_1776 = softmax(axis = var_1634, x = input_163)[name = string("op_1776")];
+            tensor<int32, [5]> var_1778 = const()[name = string("op_1778"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_19_transpose_x_0 = const()[name = string("out_19_transpose_x_0"), val = bool(false)];
+            bool out_19_transpose_y_0 = const()[name = string("out_19_transpose_y_0"), val = bool(false)];
+            string var_1776_to_fp16_dtype_0 = const()[name = string("op_1776_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_7_cast_fp16 = transpose(perm = var_1778, x = v_blocks_7_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_1776_to_fp16 = cast(dtype = var_1776_to_fp16_dtype_0, x = var_1776)[name = string("cast_387")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_19_cast_fp16 = matmul(transpose_x = out_19_transpose_x_0, transpose_y = out_19_transpose_y_0, x = var_1776_to_fp16, y = values_t_7_cast_fp16)[name = string("out_19_cast_fp16")];
+            tensor<int32, [5]> var_1781 = const()[name = string("op_1781"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_1783 = const()[name = string("op_1783"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_1782_cast_fp16 = transpose(perm = var_1781, x = out_19_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [1, 60, 1024]> out_21_cast_fp16 = reshape(shape = var_1783, x = var_1782_cast_fp16)[name = string("out_21_cast_fp16")];
+            tensor<int32, [3]> var_1786_begin_0 = const()[name = string("op_1786_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_1786_end_0 = const()[name = string("op_1786_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_1786_end_mask_0 = const()[name = string("op_1786_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_1786_cast_fp16 = slice_by_index(begin = var_1786_begin_0, end = var_1786_end_0, end_mask = var_1786_end_mask_0, x = out_21_cast_fp16)[name = string("op_1786_cast_fp16")];
+            fp16 self_attns_3_post_input_min_to_fp16 = const()[name = string("self_attns_3_post_input_min_to_fp16"), val = fp16(-0x1.08p+4)];
+            fp16 self_attns_3_post_input_max_to_fp16 = const()[name = string("self_attns_3_post_input_max_to_fp16"), val = fp16(0x1.06p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_97_cast_fp16 = clip(alpha = self_attns_3_post_input_min_to_fp16, beta = self_attns_3_post_input_max_to_fp16, x = var_1786_cast_fp16)[name = string("clip_97_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_3_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42752448))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43276800))))[name = string("self_attns_3_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_40_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_3_post_linear_weight_to_fp16_palettized, x = clip_97_cast_fp16)[name = string("linear_40_cast_fp16")];
+            fp16 self_attns_3_post_output_min_to_fp16 = const()[name = string("self_attns_3_post_output_min_to_fp16"), val = fp16(-0x1.72p+5)];
+            fp16 self_attns_3_post_output_max_to_fp16 = const()[name = string("self_attns_3_post_output_max_to_fp16"), val = fp16(0x1.7p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_98_cast_fp16 = clip(alpha = self_attns_3_post_output_min_to_fp16, beta = self_attns_3_post_output_max_to_fp16, x = linear_40_cast_fp16)[name = string("clip_98_cast_fp16")];
+            fp16 var_1798_to_fp16 = const()[name = string("op_1798_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1799_to_fp16 = const()[name = string("op_1799_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_99_cast_fp16 = clip(alpha = var_1798_to_fp16, beta = var_1799_to_fp16, x = clip_98_cast_fp16)[name = string("clip_99_cast_fp16")];
+            string clip_99_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_99_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1801 = const()[name = string("op_1801"), val = fp32(-0x1p-1)];
+            fp32 var_1805_promoted = const()[name = string("op_1805_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_99_cast_fp16_to_fp32 = cast(dtype = clip_99_cast_fp16_to_fp32_dtype_0, x = clip_99_cast_fp16)[name = string("cast_386")];
+            tensor<fp32, [1, 50, 1024]> var_1811 = pow(x = clip_99_cast_fp16_to_fp32, y = var_1805_promoted)[name = string("op_1811")];
+            tensor<int32, [1]> var_1813_axes_0 = const()[name = string("op_1813_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1813_keep_dims_0 = const()[name = string("op_1813_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1813 = reduce_mean(axes = var_1813_axes_0, keep_dims = var_1813_keep_dims_0, x = var_1811)[name = string("op_1813")];
+            string var_1813_to_fp16_dtype_0 = const()[name = string("op_1813_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1814_to_fp16 = const()[name = string("op_1814_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1813_to_fp16 = cast(dtype = var_1813_to_fp16_dtype_0, x = var_1813)[name = string("cast_385")];
+            tensor<fp16, [1, 50, 1]> mean_squared_61_cast_fp16 = add(x = var_1813_to_fp16, y = var_1814_to_fp16)[name = string("mean_squared_61_cast_fp16")];
+            string mean_squared_61_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_61_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_61_cast_fp16_to_fp32 = cast(dtype = mean_squared_61_cast_fp16_to_fp32_dtype_0, x = mean_squared_61_cast_fp16)[name = string("cast_384")];
+            tensor<fp32, [1, 50, 1]> var_1816 = pow(x = mean_squared_61_cast_fp16_to_fp32, y = var_1801)[name = string("op_1816")];
+            string var_1816_to_fp16_dtype_0 = const()[name = string("op_1816_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1816_to_fp16 = cast(dtype = var_1816_to_fp16_dtype_0, x = var_1816)[name = string("cast_383")];
+            tensor<fp16, [1, 50, 1024]> normed_output_121_cast_fp16 = mul(x = clip_99_cast_fp16, y = var_1816_to_fp16)[name = string("normed_output_121_cast_fp16")];
+            tensor<fp16, [1024]> const_55_to_fp16 = const()[name = string("const_55_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43277888)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_123_cast_fp16 = mul(x = normed_output_121_cast_fp16, y = const_55_to_fp16)[name = string("normed_output_123_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_367_cast_fp16 = add(x = normed_output_123_cast_fp16, y = hidden_states_341_cast_fp16)[name = string("hidden_states_367_cast_fp16")];
+            string hidden_states_367_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_367_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1823 = const()[name = string("op_1823"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1824 = const()[name = string("op_1824"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_1836 = const()[name = string("op_1836"), val = fp32(-0x1p-1)];
+            fp32 var_1832_promoted = const()[name = string("op_1832_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_367_cast_fp16_to_fp32 = cast(dtype = hidden_states_367_cast_fp16_to_fp32_dtype_0, x = hidden_states_367_cast_fp16)[name = string("cast_382")];
+            tensor<fp32, [1, 50, 1024]> var_1844 = pow(x = hidden_states_367_cast_fp16_to_fp32, y = var_1832_promoted)[name = string("op_1844")];
+            tensor<int32, [1]> var_1846_axes_0 = const()[name = string("op_1846_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1846_keep_dims_0 = const()[name = string("op_1846_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1846 = reduce_mean(axes = var_1846_axes_0, keep_dims = var_1846_keep_dims_0, x = var_1844)[name = string("op_1846")];
+            string var_1846_to_fp16_dtype_0 = const()[name = string("op_1846_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1847_to_fp16 = const()[name = string("op_1847_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1846_to_fp16 = cast(dtype = var_1846_to_fp16_dtype_0, x = var_1846)[name = string("cast_381")];
+            tensor<fp16, [1, 50, 1]> mean_squared_63_cast_fp16 = add(x = var_1846_to_fp16, y = var_1847_to_fp16)[name = string("mean_squared_63_cast_fp16")];
+            string mean_squared_63_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_63_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_63_cast_fp16_to_fp32 = cast(dtype = mean_squared_63_cast_fp16_to_fp32_dtype_0, x = mean_squared_63_cast_fp16)[name = string("cast_380")];
+            tensor<fp32, [1, 50, 1]> var_1849 = pow(x = mean_squared_63_cast_fp16_to_fp32, y = var_1836)[name = string("op_1849")];
+            string var_1849_to_fp16_dtype_0 = const()[name = string("op_1849_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1849_to_fp16 = cast(dtype = var_1849_to_fp16_dtype_0, x = var_1849)[name = string("cast_379")];
+            tensor<fp16, [1, 50, 1024]> normed_output_125_cast_fp16 = mul(x = hidden_states_367_cast_fp16, y = var_1849_to_fp16)[name = string("normed_output_125_cast_fp16")];
+            tensor<fp16, [1024]> const_56_to_fp16 = const()[name = string("const_56_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43280000)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_127_cast_fp16 = mul(x = normed_output_125_cast_fp16, y = const_56_to_fp16)[name = string("normed_output_127_cast_fp16")];
+            fp16 lconv1ds_3_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_3_linear_start_input_min_to_fp16"), val = fp16(-0x1.68p+3)];
+            fp16 lconv1ds_3_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_3_linear_start_input_max_to_fp16"), val = fp16(0x1.66p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_100_cast_fp16 = clip(alpha = lconv1ds_3_linear_start_input_min_to_fp16, beta = lconv1ds_3_linear_start_input_max_to_fp16, x = normed_output_127_cast_fp16)[name = string("clip_100_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_3_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43282112))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44330752))))[name = string("lconv1ds_3_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_41_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_3_linear_start_linear_weight_to_fp16_palettized, x = clip_100_cast_fp16)[name = string("linear_41_cast_fp16")];
+            fp16 lconv1ds_3_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_3_linear_start_output_min_to_fp16"), val = fp16(-0x1.9p+4)];
+            fp16 lconv1ds_3_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_3_linear_start_output_max_to_fp16"), val = fp16(0x1.8cp+4)];
+            tensor<fp16, [1, 50, 2048]> clip_101_cast_fp16 = clip(alpha = lconv1ds_3_linear_start_output_min_to_fp16, beta = lconv1ds_3_linear_start_output_max_to_fp16, x = linear_41_cast_fp16)[name = string("clip_101_cast_fp16")];
+            int32 hidden_states_375_split_num_splits_0 = const()[name = string("hidden_states_375_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_375_split_axis_0 = const()[name = string("hidden_states_375_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_375_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_375_split_cast_fp16_1 = split(axis = hidden_states_375_split_axis_0, num_splits = hidden_states_375_split_num_splits_0, x = clip_101_cast_fp16)[name = string("hidden_states_375_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_375_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_375_split_cast_fp16_1)[name = string("hidden_states_375_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_375_cast_fp16 = mul(x = hidden_states_375_split_cast_fp16_0, y = hidden_states_375_split_1_sigmoid_cast_fp16)[name = string("hidden_states_375_cast_fp16")];
+            tensor<int32, [3]> input_171_perm_0 = const()[name = string("input_171_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_173_pad_0 = const()[name = string("input_173_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_173_mode_0 = const()[name = string("input_173_mode_0"), val = string("constant")];
+            fp16 const_57_to_fp16 = const()[name = string("const_57_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_171_cast_fp16 = transpose(perm = input_171_perm_0, x = hidden_states_375_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 1024, 54]> input_173_cast_fp16 = pad(constant_val = const_57_to_fp16, mode = input_173_mode_0, pad = input_173_pad_0, x = input_171_cast_fp16)[name = string("input_173_cast_fp16")];
+            string var_1875_pad_type_0 = const()[name = string("op_1875_pad_type_0"), val = string("valid")];
+            int32 var_1875_groups_0 = const()[name = string("op_1875_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_1875_strides_0 = const()[name = string("op_1875_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1875_pad_0 = const()[name = string("op_1875_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1875_dilations_0 = const()[name = string("op_1875_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_3_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44332864))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44335488))))[name = string("lconv1ds_3_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_1875_cast_fp16 = conv(dilations = var_1875_dilations_0, groups = var_1875_groups_0, pad = var_1875_pad_0, pad_type = var_1875_pad_type_0, strides = var_1875_strides_0, weight = lconv1ds_3_depthwise_conv1d_weight_to_fp16_palettized, x = input_173_cast_fp16)[name = string("op_1875_cast_fp16")];
+            tensor<int32, [3]> hidden_states_377_perm_0 = const()[name = string("hidden_states_377_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_377_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_377_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_377_cast_fp16 = transpose(perm = hidden_states_377_perm_0, x = var_1875_cast_fp16)[name = string("transpose_48")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_377_cast_fp16_to_fp32 = cast(dtype = hidden_states_377_cast_fp16_to_fp32_dtype_0, x = hidden_states_377_cast_fp16)[name = string("cast_378")];
+            tensor<fp32, [1, 50, 1024]> clip_102 = clip(alpha = var_1824, beta = var_1823, x = hidden_states_377_cast_fp16_to_fp32)[name = string("clip_102")];
+            fp32 var_1832_promoted_1 = const()[name = string("op_1832_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1880 = pow(x = clip_102, y = var_1832_promoted_1)[name = string("op_1880")];
+            tensor<int32, [1]> var_1882_axes_0 = const()[name = string("op_1882_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1882_keep_dims_0 = const()[name = string("op_1882_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1882 = reduce_mean(axes = var_1882_axes_0, keep_dims = var_1882_keep_dims_0, x = var_1880)[name = string("op_1882")];
+            string var_1882_to_fp16_dtype_0 = const()[name = string("op_1882_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1883_to_fp16 = const()[name = string("op_1883_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1882_to_fp16 = cast(dtype = var_1882_to_fp16_dtype_0, x = var_1882)[name = string("cast_377")];
+            tensor<fp16, [1, 50, 1]> mean_squared_65_cast_fp16 = add(x = var_1882_to_fp16, y = var_1883_to_fp16)[name = string("mean_squared_65_cast_fp16")];
+            string mean_squared_65_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_65_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_65_cast_fp16_to_fp32 = cast(dtype = mean_squared_65_cast_fp16_to_fp32_dtype_0, x = mean_squared_65_cast_fp16)[name = string("cast_376")];
+            tensor<fp32, [1, 50, 1]> var_1885 = pow(x = mean_squared_65_cast_fp16_to_fp32, y = var_1836)[name = string("op_1885")];
+            string clip_102_to_fp16_dtype_0 = const()[name = string("clip_102_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1885_to_fp16_dtype_0 = const()[name = string("op_1885_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_102_to_fp16 = cast(dtype = clip_102_to_fp16_dtype_0, x = clip_102)[name = string("cast_374")];
+            tensor<fp16, [1, 50, 1]> var_1885_to_fp16 = cast(dtype = var_1885_to_fp16_dtype_0, x = var_1885)[name = string("cast_375")];
+            tensor<fp16, [1, 50, 1024]> normed_output_129_cast_fp16 = mul(x = clip_102_to_fp16, y = var_1885_to_fp16)[name = string("normed_output_129_cast_fp16")];
+            tensor<fp16, [1024]> const_58_to_fp16 = const()[name = string("const_58_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44336576)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_131_cast_fp16 = mul(x = normed_output_129_cast_fp16, y = const_58_to_fp16)[name = string("normed_output_131_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_383_cast_fp16 = silu(x = normed_output_131_cast_fp16)[name = string("hidden_states_383_cast_fp16")];
+            fp16 lconv1ds_3_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_3_linear_end_input_min_to_fp16"), val = fp16(-0x1.e4p+2)];
+            fp16 lconv1ds_3_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_3_linear_end_input_max_to_fp16"), val = fp16(0x1.ep+2)];
+            tensor<fp16, [1, 50, 1024]> clip_103_cast_fp16 = clip(alpha = lconv1ds_3_linear_end_input_min_to_fp16, beta = lconv1ds_3_linear_end_input_max_to_fp16, x = hidden_states_383_cast_fp16)[name = string("clip_103_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_3_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44338688))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44863040))))[name = string("lconv1ds_3_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_42_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_3_linear_end_linear_weight_to_fp16_palettized, x = clip_103_cast_fp16)[name = string("linear_42_cast_fp16")];
+            fp16 lconv1ds_3_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_3_linear_end_output_min_to_fp16"), val = fp16(-0x1.fcp+2)];
+            fp16 lconv1ds_3_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_3_linear_end_output_max_to_fp16"), val = fp16(0x1.f8p+2)];
+            tensor<fp16, [1, 50, 1024]> clip_104_cast_fp16 = clip(alpha = lconv1ds_3_linear_end_output_min_to_fp16, beta = lconv1ds_3_linear_end_output_max_to_fp16, x = linear_42_cast_fp16)[name = string("clip_104_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_389_cast_fp16 = add(x = clip_104_cast_fp16, y = hidden_states_367_cast_fp16)[name = string("hidden_states_389_cast_fp16")];
+            string hidden_states_389_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_389_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1909 = const()[name = string("op_1909"), val = fp32(-0x1p-1)];
+            fp32 var_1910 = const()[name = string("op_1910"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1911 = const()[name = string("op_1911"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_389_cast_fp16_to_fp32 = cast(dtype = hidden_states_389_cast_fp16_to_fp32_dtype_0, x = hidden_states_389_cast_fp16)[name = string("cast_373")];
+            tensor<fp32, [1, 50, 1024]> clip_105 = clip(alpha = var_1911, beta = var_1910, x = hidden_states_389_cast_fp16_to_fp32)[name = string("clip_105")];
+            fp32 var_1905_promoted = const()[name = string("op_1905_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1919 = pow(x = clip_105, y = var_1905_promoted)[name = string("op_1919")];
+            tensor<int32, [1]> var_1921_axes_0 = const()[name = string("op_1921_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1921_keep_dims_0 = const()[name = string("op_1921_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1921 = reduce_mean(axes = var_1921_axes_0, keep_dims = var_1921_keep_dims_0, x = var_1919)[name = string("op_1921")];
+            string var_1921_to_fp16_dtype_0 = const()[name = string("op_1921_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1922_to_fp16 = const()[name = string("op_1922_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1921_to_fp16 = cast(dtype = var_1921_to_fp16_dtype_0, x = var_1921)[name = string("cast_372")];
+            tensor<fp16, [1, 50, 1]> mean_squared_67_cast_fp16 = add(x = var_1921_to_fp16, y = var_1922_to_fp16)[name = string("mean_squared_67_cast_fp16")];
+            string mean_squared_67_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_67_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_67_cast_fp16_to_fp32 = cast(dtype = mean_squared_67_cast_fp16_to_fp32_dtype_0, x = mean_squared_67_cast_fp16)[name = string("cast_371")];
+            tensor<fp32, [1, 50, 1]> var_1924 = pow(x = mean_squared_67_cast_fp16_to_fp32, y = var_1909)[name = string("op_1924")];
+            string clip_105_to_fp16_dtype_0 = const()[name = string("clip_105_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1924_to_fp16_dtype_0 = const()[name = string("op_1924_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_105_to_fp16 = cast(dtype = clip_105_to_fp16_dtype_0, x = clip_105)[name = string("cast_369")];
+            tensor<fp16, [1, 50, 1]> var_1924_to_fp16 = cast(dtype = var_1924_to_fp16_dtype_0, x = var_1924)[name = string("cast_370")];
+            tensor<fp16, [1, 50, 1024]> normed_output_133_cast_fp16 = mul(x = clip_105_to_fp16, y = var_1924_to_fp16)[name = string("normed_output_133_cast_fp16")];
+            tensor<fp16, [1024]> const_59_to_fp16 = const()[name = string("const_59_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44864128)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_135_cast_fp16 = mul(x = normed_output_133_cast_fp16, y = const_59_to_fp16)[name = string("normed_output_135_cast_fp16")];
+            fp16 feed_forward2s_3_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.bap+3)];
+            fp16 feed_forward2s_3_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.b6p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_106_cast_fp16 = clip(alpha = feed_forward2s_3_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_3_ffw_layer_1_input_max_to_fp16, x = normed_output_135_cast_fp16)[name = string("clip_106_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_3_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44866240))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46963456))))[name = string("feed_forward2s_3_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_43_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_3_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_106_cast_fp16)[name = string("linear_43_cast_fp16")];
+            fp16 feed_forward2s_3_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.d8p+4)];
+            fp16 feed_forward2s_3_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.d4p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_107_cast_fp16 = clip(alpha = feed_forward2s_3_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_3_ffw_layer_1_output_max_to_fp16, x = linear_43_cast_fp16)[name = string("clip_107_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_399_cast_fp16 = silu(x = clip_107_cast_fp16)[name = string("hidden_states_399_cast_fp16")];
+            fp16 feed_forward2s_3_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.acp+3)];
+            fp16 feed_forward2s_3_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.a8p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_108_cast_fp16 = clip(alpha = feed_forward2s_3_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_3_ffw_layer_2_input_max_to_fp16, x = hidden_states_399_cast_fp16)[name = string("clip_108_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_3_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46967616))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49064832))))[name = string("feed_forward2s_3_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_44_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_3_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_108_cast_fp16)[name = string("linear_44_cast_fp16")];
+            fp16 feed_forward2s_3_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.74p+6)];
+            fp16 feed_forward2s_3_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_3_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.72p+6)];
+            tensor<fp16, [1, 50, 1024]> clip_109_cast_fp16 = clip(alpha = feed_forward2s_3_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_3_ffw_layer_2_output_max_to_fp16, x = linear_44_cast_fp16)[name = string("clip_109_cast_fp16")];
+            string clip_109_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_109_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_109_cast_fp16_to_fp32 = cast(dtype = clip_109_cast_fp16_to_fp32_dtype_0, x = clip_109_cast_fp16)[name = string("cast_368")];
+            tensor<fp32, [1, 50, 1024]> clip_110 = clip(alpha = var_1911, beta = var_1910, x = clip_109_cast_fp16_to_fp32)[name = string("clip_110")];
+            fp32 var_1905_promoted_1 = const()[name = string("op_1905_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_1951 = pow(x = clip_110, y = var_1905_promoted_1)[name = string("op_1951")];
+            tensor<int32, [1]> var_1953_axes_0 = const()[name = string("op_1953_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1953_keep_dims_0 = const()[name = string("op_1953_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1953 = reduce_mean(axes = var_1953_axes_0, keep_dims = var_1953_keep_dims_0, x = var_1951)[name = string("op_1953")];
+            string var_1953_to_fp16_dtype_0 = const()[name = string("op_1953_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1954_to_fp16 = const()[name = string("op_1954_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1953_to_fp16 = cast(dtype = var_1953_to_fp16_dtype_0, x = var_1953)[name = string("cast_367")];
+            tensor<fp16, [1, 50, 1]> mean_squared_69_cast_fp16 = add(x = var_1953_to_fp16, y = var_1954_to_fp16)[name = string("mean_squared_69_cast_fp16")];
+            string mean_squared_69_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_69_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_69_cast_fp16_to_fp32 = cast(dtype = mean_squared_69_cast_fp16_to_fp32_dtype_0, x = mean_squared_69_cast_fp16)[name = string("cast_366")];
+            tensor<fp32, [1, 50, 1]> var_1956 = pow(x = mean_squared_69_cast_fp16_to_fp32, y = var_1909)[name = string("op_1956")];
+            string clip_110_to_fp16_dtype_0 = const()[name = string("clip_110_to_fp16_dtype_0"), val = string("fp16")];
+            string var_1956_to_fp16_dtype_0 = const()[name = string("op_1956_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_110_to_fp16 = cast(dtype = clip_110_to_fp16_dtype_0, x = clip_110)[name = string("cast_364")];
+            tensor<fp16, [1, 50, 1]> var_1956_to_fp16 = cast(dtype = var_1956_to_fp16_dtype_0, x = var_1956)[name = string("cast_365")];
+            tensor<fp16, [1, 50, 1024]> normed_output_137_cast_fp16 = mul(x = clip_110_to_fp16, y = var_1956_to_fp16)[name = string("normed_output_137_cast_fp16")];
+            tensor<fp16, [1024]> const_60_to_fp16 = const()[name = string("const_60_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49065920)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_139_cast_fp16 = mul(x = normed_output_137_cast_fp16, y = const_60_to_fp16)[name = string("normed_output_139_cast_fp16")];
+            fp16 var_1901_to_fp16 = const()[name = string("op_1901_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_411_cast_fp16 = mul(x = normed_output_139_cast_fp16, y = var_1901_to_fp16)[name = string("hidden_states_411_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_413_cast_fp16 = add(x = hidden_states_411_cast_fp16, y = hidden_states_389_cast_fp16)[name = string("hidden_states_413_cast_fp16")];
+            fp16 var_1963_to_fp16 = const()[name = string("op_1963_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_1964_to_fp16 = const()[name = string("op_1964_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_111_cast_fp16 = clip(alpha = var_1963_to_fp16, beta = var_1964_to_fp16, x = hidden_states_413_cast_fp16)[name = string("clip_111_cast_fp16")];
+            string clip_111_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_111_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1966 = const()[name = string("op_1966"), val = fp32(-0x1p-1)];
+            fp32 var_1970_promoted = const()[name = string("op_1970_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_111_cast_fp16_to_fp32 = cast(dtype = clip_111_cast_fp16_to_fp32_dtype_0, x = clip_111_cast_fp16)[name = string("cast_363")];
+            tensor<fp32, [1, 50, 1024]> var_1976 = pow(x = clip_111_cast_fp16_to_fp32, y = var_1970_promoted)[name = string("op_1976")];
+            tensor<int32, [1]> var_1978_axes_0 = const()[name = string("op_1978_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1978_keep_dims_0 = const()[name = string("op_1978_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_1978 = reduce_mean(axes = var_1978_axes_0, keep_dims = var_1978_keep_dims_0, x = var_1976)[name = string("op_1978")];
+            string var_1978_to_fp16_dtype_0 = const()[name = string("op_1978_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_1979_to_fp16 = const()[name = string("op_1979_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_1978_to_fp16 = cast(dtype = var_1978_to_fp16_dtype_0, x = var_1978)[name = string("cast_362")];
+            tensor<fp16, [1, 50, 1]> mean_squared_71_cast_fp16 = add(x = var_1978_to_fp16, y = var_1979_to_fp16)[name = string("mean_squared_71_cast_fp16")];
+            string mean_squared_71_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_71_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_71_cast_fp16_to_fp32 = cast(dtype = mean_squared_71_cast_fp16_to_fp32_dtype_0, x = mean_squared_71_cast_fp16)[name = string("cast_361")];
+            tensor<fp32, [1, 50, 1]> var_1981 = pow(x = mean_squared_71_cast_fp16_to_fp32, y = var_1966)[name = string("op_1981")];
+            string var_1981_to_fp16_dtype_0 = const()[name = string("op_1981_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_1981_to_fp16 = cast(dtype = var_1981_to_fp16_dtype_0, x = var_1981)[name = string("cast_360")];
+            tensor<fp16, [1, 50, 1024]> normed_output_141_cast_fp16 = mul(x = clip_111_cast_fp16, y = var_1981_to_fp16)[name = string("normed_output_141_cast_fp16")];
+            tensor<fp16, [1024]> const_61_to_fp16 = const()[name = string("const_61_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49068032)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_143_cast_fp16 = mul(x = normed_output_141_cast_fp16, y = const_61_to_fp16)[name = string("normed_output_143_cast_fp16")];
+            string normed_output_143_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_143_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_1994 = const()[name = string("op_1994"), val = fp32(-0x1p-1)];
+            fp32 var_1995 = const()[name = string("op_1995"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_1996 = const()[name = string("op_1996"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_143_cast_fp16_to_fp32 = cast(dtype = normed_output_143_cast_fp16_to_fp32_dtype_0, x = normed_output_143_cast_fp16)[name = string("cast_359")];
+            tensor<fp32, [1, 50, 1024]> clip_112 = clip(alpha = var_1996, beta = var_1995, x = normed_output_143_cast_fp16_to_fp32)[name = string("clip_112")];
+            fp32 var_1990_promoted = const()[name = string("op_1990_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2004 = pow(x = clip_112, y = var_1990_promoted)[name = string("op_2004")];
+            tensor<int32, [1]> var_2006_axes_0 = const()[name = string("op_2006_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2006_keep_dims_0 = const()[name = string("op_2006_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2006 = reduce_mean(axes = var_2006_axes_0, keep_dims = var_2006_keep_dims_0, x = var_2004)[name = string("op_2006")];
+            string var_2006_to_fp16_dtype_0 = const()[name = string("op_2006_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2007_to_fp16 = const()[name = string("op_2007_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2006_to_fp16 = cast(dtype = var_2006_to_fp16_dtype_0, x = var_2006)[name = string("cast_358")];
+            tensor<fp16, [1, 50, 1]> mean_squared_73_cast_fp16 = add(x = var_2006_to_fp16, y = var_2007_to_fp16)[name = string("mean_squared_73_cast_fp16")];
+            string mean_squared_73_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_73_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_73_cast_fp16_to_fp32 = cast(dtype = mean_squared_73_cast_fp16_to_fp32_dtype_0, x = mean_squared_73_cast_fp16)[name = string("cast_357")];
+            tensor<fp32, [1, 50, 1]> var_2009 = pow(x = mean_squared_73_cast_fp16_to_fp32, y = var_1994)[name = string("op_2009")];
+            string clip_112_to_fp16_dtype_0 = const()[name = string("clip_112_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2009_to_fp16_dtype_0 = const()[name = string("op_2009_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_112_to_fp16 = cast(dtype = clip_112_to_fp16_dtype_0, x = clip_112)[name = string("cast_355")];
+            tensor<fp16, [1, 50, 1]> var_2009_to_fp16 = cast(dtype = var_2009_to_fp16_dtype_0, x = var_2009)[name = string("cast_356")];
+            tensor<fp16, [1, 50, 1024]> normed_output_145_cast_fp16 = mul(x = clip_112_to_fp16, y = var_2009_to_fp16)[name = string("normed_output_145_cast_fp16")];
+            tensor<fp16, [1024]> const_62_to_fp16 = const()[name = string("const_62_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49070144)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_147_cast_fp16 = mul(x = normed_output_145_cast_fp16, y = const_62_to_fp16)[name = string("normed_output_147_cast_fp16")];
+            fp16 feed_forward1s_4_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.8ap+3)];
+            fp16 feed_forward1s_4_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.86p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_113_cast_fp16 = clip(alpha = feed_forward1s_4_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_4_ffw_layer_1_input_max_to_fp16, x = normed_output_147_cast_fp16)[name = string("clip_113_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_4_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(49072256))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51169472))))[name = string("feed_forward1s_4_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_45_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_4_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_113_cast_fp16)[name = string("linear_45_cast_fp16")];
+            fp16 feed_forward1s_4_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.ap+4)];
+            fp16 feed_forward1s_4_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.9cp+4)];
+            tensor<fp16, [1, 50, 4096]> clip_114_cast_fp16 = clip(alpha = feed_forward1s_4_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_4_ffw_layer_1_output_max_to_fp16, x = linear_45_cast_fp16)[name = string("clip_114_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_429_cast_fp16 = silu(x = clip_114_cast_fp16)[name = string("hidden_states_429_cast_fp16")];
+            fp16 feed_forward1s_4_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.32p+3)];
+            fp16 feed_forward1s_4_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.2ep+3)];
+            tensor<fp16, [1, 50, 4096]> clip_115_cast_fp16 = clip(alpha = feed_forward1s_4_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_4_ffw_layer_2_input_max_to_fp16, x = hidden_states_429_cast_fp16)[name = string("clip_115_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_4_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(51173632))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53270848))))[name = string("feed_forward1s_4_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_46_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_4_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_115_cast_fp16)[name = string("linear_46_cast_fp16")];
+            fp16 feed_forward1s_4_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.7cp+5)];
+            fp16 feed_forward1s_4_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_4_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.78p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_116_cast_fp16 = clip(alpha = feed_forward1s_4_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_4_ffw_layer_2_output_max_to_fp16, x = linear_46_cast_fp16)[name = string("clip_116_cast_fp16")];
+            string clip_116_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_116_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_116_cast_fp16_to_fp32 = cast(dtype = clip_116_cast_fp16_to_fp32_dtype_0, x = clip_116_cast_fp16)[name = string("cast_354")];
+            tensor<fp32, [1, 50, 1024]> clip_117 = clip(alpha = var_1996, beta = var_1995, x = clip_116_cast_fp16_to_fp32)[name = string("clip_117")];
+            fp32 var_1990_promoted_1 = const()[name = string("op_1990_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2036 = pow(x = clip_117, y = var_1990_promoted_1)[name = string("op_2036")];
+            tensor<int32, [1]> var_2038_axes_0 = const()[name = string("op_2038_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2038_keep_dims_0 = const()[name = string("op_2038_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2038 = reduce_mean(axes = var_2038_axes_0, keep_dims = var_2038_keep_dims_0, x = var_2036)[name = string("op_2038")];
+            string var_2038_to_fp16_dtype_0 = const()[name = string("op_2038_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2039_to_fp16 = const()[name = string("op_2039_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2038_to_fp16 = cast(dtype = var_2038_to_fp16_dtype_0, x = var_2038)[name = string("cast_353")];
+            tensor<fp16, [1, 50, 1]> mean_squared_75_cast_fp16 = add(x = var_2038_to_fp16, y = var_2039_to_fp16)[name = string("mean_squared_75_cast_fp16")];
+            string mean_squared_75_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_75_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_75_cast_fp16_to_fp32 = cast(dtype = mean_squared_75_cast_fp16_to_fp32_dtype_0, x = mean_squared_75_cast_fp16)[name = string("cast_352")];
+            tensor<fp32, [1, 50, 1]> var_2041 = pow(x = mean_squared_75_cast_fp16_to_fp32, y = var_1994)[name = string("op_2041")];
+            string clip_117_to_fp16_dtype_0 = const()[name = string("clip_117_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2041_to_fp16_dtype_0 = const()[name = string("op_2041_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_117_to_fp16 = cast(dtype = clip_117_to_fp16_dtype_0, x = clip_117)[name = string("cast_350")];
+            tensor<fp16, [1, 50, 1]> var_2041_to_fp16 = cast(dtype = var_2041_to_fp16_dtype_0, x = var_2041)[name = string("cast_351")];
+            tensor<fp16, [1, 50, 1024]> normed_output_149_cast_fp16 = mul(x = clip_117_to_fp16, y = var_2041_to_fp16)[name = string("normed_output_149_cast_fp16")];
+            tensor<fp16, [1024]> const_63_to_fp16 = const()[name = string("const_63_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53271936)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_151_cast_fp16 = mul(x = normed_output_149_cast_fp16, y = const_63_to_fp16)[name = string("normed_output_151_cast_fp16")];
+            fp16 var_1986_to_fp16 = const()[name = string("op_1986_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_441_cast_fp16 = mul(x = normed_output_151_cast_fp16, y = var_1986_to_fp16)[name = string("hidden_states_441_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_443_cast_fp16 = add(x = hidden_states_441_cast_fp16, y = normed_output_143_cast_fp16)[name = string("hidden_states_443_cast_fp16")];
+            fp16 var_2048_to_fp16 = const()[name = string("op_2048_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2049_to_fp16 = const()[name = string("op_2049_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_118_cast_fp16 = clip(alpha = var_2048_to_fp16, beta = var_2049_to_fp16, x = hidden_states_443_cast_fp16)[name = string("clip_118_cast_fp16")];
+            string clip_118_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_118_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2051 = const()[name = string("op_2051"), val = fp32(-0x1p-1)];
+            fp32 var_2055_promoted = const()[name = string("op_2055_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_118_cast_fp16_to_fp32 = cast(dtype = clip_118_cast_fp16_to_fp32_dtype_0, x = clip_118_cast_fp16)[name = string("cast_349")];
+            tensor<fp32, [1, 50, 1024]> var_2061 = pow(x = clip_118_cast_fp16_to_fp32, y = var_2055_promoted)[name = string("op_2061")];
+            tensor<int32, [1]> var_2063_axes_0 = const()[name = string("op_2063_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2063_keep_dims_0 = const()[name = string("op_2063_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2063 = reduce_mean(axes = var_2063_axes_0, keep_dims = var_2063_keep_dims_0, x = var_2061)[name = string("op_2063")];
+            string var_2063_to_fp16_dtype_0 = const()[name = string("op_2063_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2064_to_fp16 = const()[name = string("op_2064_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2063_to_fp16 = cast(dtype = var_2063_to_fp16_dtype_0, x = var_2063)[name = string("cast_348")];
+            tensor<fp16, [1, 50, 1]> mean_squared_77_cast_fp16 = add(x = var_2063_to_fp16, y = var_2064_to_fp16)[name = string("mean_squared_77_cast_fp16")];
+            string mean_squared_77_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_77_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_77_cast_fp16_to_fp32 = cast(dtype = mean_squared_77_cast_fp16_to_fp32_dtype_0, x = mean_squared_77_cast_fp16)[name = string("cast_347")];
+            tensor<fp32, [1, 50, 1]> var_2066 = pow(x = mean_squared_77_cast_fp16_to_fp32, y = var_2051)[name = string("op_2066")];
+            string var_2066_to_fp16_dtype_0 = const()[name = string("op_2066_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2066_to_fp16 = cast(dtype = var_2066_to_fp16_dtype_0, x = var_2066)[name = string("cast_346")];
+            tensor<fp16, [1, 50, 1024]> normed_output_153_cast_fp16 = mul(x = clip_118_cast_fp16, y = var_2066_to_fp16)[name = string("normed_output_153_cast_fp16")];
+            tensor<fp16, [1024]> const_64_to_fp16 = const()[name = string("const_64_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53274048)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_155_cast_fp16 = mul(x = normed_output_153_cast_fp16, y = const_64_to_fp16)[name = string("normed_output_155_cast_fp16")];
+            int32 var_2072 = const()[name = string("op_2072"), val = int32(-1)];
+            fp32 var_2073 = const()[name = string("op_2073"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_4_q_proj_input_min_to_fp16 = const()[name = string("self_attns_4_q_proj_input_min_to_fp16"), val = fp16(-0x1.54p+3)];
+            fp16 self_attns_4_q_proj_input_max_to_fp16 = const()[name = string("self_attns_4_q_proj_input_max_to_fp16"), val = fp16(0x1.52p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_119_cast_fp16 = clip(alpha = self_attns_4_q_proj_input_min_to_fp16, beta = self_attns_4_q_proj_input_max_to_fp16, x = normed_output_155_cast_fp16)[name = string("clip_119_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_4_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53276160))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53800512))))[name = string("self_attns_4_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_47_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_4_q_proj_linear_weight_to_fp16_palettized, x = clip_119_cast_fp16)[name = string("linear_47_cast_fp16")];
+            fp16 self_attns_4_q_proj_output_min_to_fp16 = const()[name = string("self_attns_4_q_proj_output_min_to_fp16"), val = fp16(-0x1.3p+4)];
+            fp16 self_attns_4_q_proj_output_max_to_fp16 = const()[name = string("self_attns_4_q_proj_output_max_to_fp16"), val = fp16(0x1.2ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_120_cast_fp16 = clip(alpha = self_attns_4_q_proj_output_min_to_fp16, beta = self_attns_4_q_proj_output_max_to_fp16, x = linear_47_cast_fp16)[name = string("clip_120_cast_fp16")];
+            tensor<int32, [4]> var_2117 = const()[name = string("op_2117"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_9_cast_fp16 = reshape(shape = var_2117, x = clip_120_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_4_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(53801600))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54325952))))[name = string("self_attns_4_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_48_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_4_k_proj_linear_weight_to_fp16_palettized, x = clip_119_cast_fp16)[name = string("linear_48_cast_fp16")];
+            fp16 self_attns_4_k_proj_output_min_to_fp16 = const()[name = string("self_attns_4_k_proj_output_min_to_fp16"), val = fp16(-0x1.3p+4)];
+            fp16 self_attns_4_k_proj_output_max_to_fp16 = const()[name = string("self_attns_4_k_proj_output_max_to_fp16"), val = fp16(0x1.2ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_122_cast_fp16 = clip(alpha = self_attns_4_k_proj_output_min_to_fp16, beta = self_attns_4_k_proj_output_max_to_fp16, x = linear_48_cast_fp16)[name = string("clip_122_cast_fp16")];
+            tensor<int32, [4]> var_2129 = const()[name = string("op_2129"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_9_cast_fp16 = reshape(shape = var_2129, x = clip_122_cast_fp16)[name = string("k_9_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_4_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54327040))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54851392))))[name = string("self_attns_4_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_49_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_4_v_proj_linear_weight_to_fp16_palettized, x = clip_119_cast_fp16)[name = string("linear_49_cast_fp16")];
+            fp16 self_attns_4_v_proj_output_min_to_fp16 = const()[name = string("self_attns_4_v_proj_output_min_to_fp16"), val = fp16(-0x1.3p+4)];
+            fp16 self_attns_4_v_proj_output_max_to_fp16 = const()[name = string("self_attns_4_v_proj_output_max_to_fp16"), val = fp16(0x1.2ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_124_cast_fp16 = clip(alpha = self_attns_4_v_proj_output_min_to_fp16, beta = self_attns_4_v_proj_output_max_to_fp16, x = linear_49_cast_fp16)[name = string("clip_124_cast_fp16")];
+            tensor<int32, [4]> var_2141 = const()[name = string("op_2141"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_201_cast_fp16 = reshape(shape = var_2141, x = clip_124_cast_fp16)[name = string("input_201_cast_fp16")];
+            fp16 var_2143_to_fp16 = const()[name = string("op_2143_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_2144_cast_fp16 = mul(x = q_9_cast_fp16, y = var_2143_to_fp16)[name = string("op_2144_cast_fp16")];
+            tensor<fp16, [128]> var_2145_to_fp16 = const()[name = string("op_2145_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54852480)))];
+            tensor<fp16, [1, 50, 8, 128]> input_197_cast_fp16 = mul(x = var_2144_cast_fp16, y = var_2145_to_fp16)[name = string("input_197_cast_fp16")];
+            fp16 var_2147_to_fp16 = const()[name = string("op_2147_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_199_cast_fp16 = mul(x = k_9_cast_fp16, y = var_2147_to_fp16)[name = string("input_199_cast_fp16")];
+            tensor<int32, [8]> q_padded_9_pad_0 = const()[name = string("q_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_9_mode_0 = const()[name = string("q_padded_9_mode_0"), val = string("constant")];
+            fp16 const_65_to_fp16 = const()[name = string("const_65_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_9_cast_fp16 = pad(constant_val = const_65_to_fp16, mode = q_padded_9_mode_0, pad = q_padded_9_pad_0, x = input_197_cast_fp16)[name = string("q_padded_9_cast_fp16")];
+            tensor<int32, [5]> var_2151 = const()[name = string("op_2151"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_9_cast_fp16 = reshape(shape = var_2151, x = q_padded_9_cast_fp16)[name = string("q_blocks_9_cast_fp16")];
+            tensor<int32, [8]> k_padded_9_pad_0 = const()[name = string("k_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_9_mode_0 = const()[name = string("k_padded_9_mode_0"), val = string("constant")];
+            fp16 const_66_to_fp16 = const()[name = string("const_66_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_9_cast_fp16 = pad(constant_val = const_66_to_fp16, mode = k_padded_9_mode_0, pad = k_padded_9_pad_0, x = input_199_cast_fp16)[name = string("k_padded_9_cast_fp16")];
+            tensor<int32, [8]> v_padded_9_pad_0 = const()[name = string("v_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_9_mode_0 = const()[name = string("v_padded_9_mode_0"), val = string("constant")];
+            fp16 const_67_to_fp16 = const()[name = string("const_67_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_9_cast_fp16 = pad(constant_val = const_67_to_fp16, mode = v_padded_9_mode_0, pad = v_padded_9_pad_0, x = input_201_cast_fp16)[name = string("v_padded_9_cast_fp16")];
+            tensor<int32, [4]> var_2158_begin_0 = const()[name = string("op_2158_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2158_end_0 = const()[name = string("op_2158_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_2158_end_mask_0 = const()[name = string("op_2158_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2158_cast_fp16 = slice_by_index(begin = var_2158_begin_0, end = var_2158_end_0, end_mask = var_2158_end_mask_0, x = k_padded_9_cast_fp16)[name = string("op_2158_cast_fp16")];
+            tensor<int32, [4]> var_2160_begin_0 = const()[name = string("op_2160_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_2160_end_0 = const()[name = string("op_2160_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_2160_end_mask_0 = const()[name = string("op_2160_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2160_cast_fp16 = slice_by_index(begin = var_2160_begin_0, end = var_2160_end_0, end_mask = var_2160_end_mask_0, x = k_padded_9_cast_fp16)[name = string("op_2160_cast_fp16")];
+            tensor<int32, [4]> var_2162_begin_0 = const()[name = string("op_2162_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_2162_end_0 = const()[name = string("op_2162_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_2162_end_mask_0 = const()[name = string("op_2162_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2162_cast_fp16 = slice_by_index(begin = var_2162_begin_0, end = var_2162_end_0, end_mask = var_2162_end_mask_0, x = k_padded_9_cast_fp16)[name = string("op_2162_cast_fp16")];
+            tensor<int32, [4]> var_2164_begin_0 = const()[name = string("op_2164_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_2164_end_0 = const()[name = string("op_2164_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_2164_end_mask_0 = const()[name = string("op_2164_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2164_cast_fp16 = slice_by_index(begin = var_2164_begin_0, end = var_2164_end_0, end_mask = var_2164_end_mask_0, x = k_padded_9_cast_fp16)[name = string("op_2164_cast_fp16")];
+            tensor<int32, [4]> var_2166_begin_0 = const()[name = string("op_2166_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_2166_end_0 = const()[name = string("op_2166_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_2166_end_mask_0 = const()[name = string("op_2166_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2166_cast_fp16 = slice_by_index(begin = var_2166_begin_0, end = var_2166_end_0, end_mask = var_2166_end_mask_0, x = k_padded_9_cast_fp16)[name = string("op_2166_cast_fp16")];
+            int32 k_blocks_9_axis_0 = const()[name = string("k_blocks_9_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_9_cast_fp16 = stack(axis = k_blocks_9_axis_0, values = (var_2158_cast_fp16, var_2160_cast_fp16, var_2162_cast_fp16, var_2164_cast_fp16, var_2166_cast_fp16))[name = string("k_blocks_9_cast_fp16")];
+            tensor<int32, [4]> var_2170_begin_0 = const()[name = string("op_2170_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2170_end_0 = const()[name = string("op_2170_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_2170_end_mask_0 = const()[name = string("op_2170_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2170_cast_fp16 = slice_by_index(begin = var_2170_begin_0, end = var_2170_end_0, end_mask = var_2170_end_mask_0, x = v_padded_9_cast_fp16)[name = string("op_2170_cast_fp16")];
+            tensor<int32, [4]> var_2172_begin_0 = const()[name = string("op_2172_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_2172_end_0 = const()[name = string("op_2172_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_2172_end_mask_0 = const()[name = string("op_2172_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2172_cast_fp16 = slice_by_index(begin = var_2172_begin_0, end = var_2172_end_0, end_mask = var_2172_end_mask_0, x = v_padded_9_cast_fp16)[name = string("op_2172_cast_fp16")];
+            tensor<int32, [4]> var_2174_begin_0 = const()[name = string("op_2174_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_2174_end_0 = const()[name = string("op_2174_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_2174_end_mask_0 = const()[name = string("op_2174_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2174_cast_fp16 = slice_by_index(begin = var_2174_begin_0, end = var_2174_end_0, end_mask = var_2174_end_mask_0, x = v_padded_9_cast_fp16)[name = string("op_2174_cast_fp16")];
+            tensor<int32, [4]> var_2176_begin_0 = const()[name = string("op_2176_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_2176_end_0 = const()[name = string("op_2176_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_2176_end_mask_0 = const()[name = string("op_2176_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2176_cast_fp16 = slice_by_index(begin = var_2176_begin_0, end = var_2176_end_0, end_mask = var_2176_end_mask_0, x = v_padded_9_cast_fp16)[name = string("op_2176_cast_fp16")];
+            tensor<int32, [4]> var_2178_begin_0 = const()[name = string("op_2178_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_2178_end_0 = const()[name = string("op_2178_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_2178_end_mask_0 = const()[name = string("op_2178_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2178_cast_fp16 = slice_by_index(begin = var_2178_begin_0, end = var_2178_end_0, end_mask = var_2178_end_mask_0, x = v_padded_9_cast_fp16)[name = string("op_2178_cast_fp16")];
+            int32 v_blocks_9_axis_0 = const()[name = string("v_blocks_9_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_9_cast_fp16 = stack(axis = v_blocks_9_axis_0, values = (var_2170_cast_fp16, var_2172_cast_fp16, var_2174_cast_fp16, var_2176_cast_fp16, var_2178_cast_fp16))[name = string("v_blocks_9_cast_fp16")];
+            tensor<int32, [5]> var_2186 = const()[name = string("op_2186"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_2188 = const()[name = string("op_2188"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_9_transpose_x_0 = const()[name = string("matrix_ac_9_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_9_transpose_y_0 = const()[name = string("matrix_ac_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_9_cast_fp16 = transpose(perm = var_2186, x = q_blocks_9_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_9_cast_fp16 = transpose(perm = var_2188, x = k_blocks_9_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_9_cast_fp16 = matmul(transpose_x = matrix_ac_9_transpose_x_0, transpose_y = matrix_ac_9_transpose_y_0, x = queries_9_cast_fp16, y = keys_t_9_cast_fp16)[name = string("matrix_ac_9_cast_fp16")];
+            tensor<int32, [4]> var_2191 = const()[name = string("op_2191"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_9_cast_fp16 = reshape(shape = var_2191, x = queries_9_cast_fp16)[name = string("q_flat_9_cast_fp16")];
+            bool matrix_bd_41_transpose_x_0 = const()[name = string("matrix_bd_41_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_41_transpose_y_0 = const()[name = string("matrix_bd_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_9_to_fp16 = const()[name = string("rel_k_t_9_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54852800)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_41_cast_fp16 = matmul(transpose_x = matrix_bd_41_transpose_x_0, transpose_y = matrix_bd_41_transpose_y_0, x = q_flat_9_cast_fp16, y = rel_k_t_9_to_fp16)[name = string("matrix_bd_41_cast_fp16")];
+            tensor<int32, [5]> var_2196 = const()[name = string("op_2196"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_203_cast_fp16 = reshape(shape = var_2196, x = matrix_bd_41_cast_fp16)[name = string("input_203_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_43_pad_0 = const()[name = string("matrix_bd_43_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54879488)))];
+            string matrix_bd_43_mode_0 = const()[name = string("matrix_bd_43_mode_0"), val = string("constant")];
+            fp16 const_69_to_fp16 = const()[name = string("const_69_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_43_cast_fp16 = pad(constant_val = const_69_to_fp16, mode = matrix_bd_43_mode_0, pad = matrix_bd_43_pad_0, x = input_203_cast_fp16)[name = string("matrix_bd_43_cast_fp16")];
+            tensor<int32, [4]> var_2200 = const()[name = string("op_2200"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_45_cast_fp16 = reshape(shape = var_2200, x = matrix_bd_43_cast_fp16)[name = string("matrix_bd_45_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_47_begin_0 = const()[name = string("matrix_bd_47_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_47_end_0 = const()[name = string("matrix_bd_47_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_47_end_mask_0 = const()[name = string("matrix_bd_47_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_47_cast_fp16 = slice_by_index(begin = matrix_bd_47_begin_0, end = matrix_bd_47_end_0, end_mask = matrix_bd_47_end_mask_0, x = matrix_bd_45_cast_fp16)[name = string("matrix_bd_47_cast_fp16")];
+            tensor<int32, [5]> var_2206 = const()[name = string("op_2206"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_49_cast_fp16 = reshape(shape = var_2206, x = matrix_bd_47_cast_fp16)[name = string("matrix_bd_49_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_25_cast_fp16 = add(x = matrix_ac_9_cast_fp16, y = matrix_bd_49_cast_fp16)[name = string("attn_25_cast_fp16")];
+            fp16 _inversed_2209_y_0_to_fp16 = const()[name = string("_inversed_2209_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_2209_cast_fp16 = mul(x = attn_25_cast_fp16, y = _inversed_2209_y_0_to_fp16)[name = string("_inversed_2209_cast_fp16")];
+            string _inversed_2209_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_2209_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_2209_cast_fp16_to_fp32 = cast(dtype = _inversed_2209_cast_fp16_to_fp32_dtype_0, x = _inversed_2209_cast_fp16)[name = string("cast_345")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_2210 = tanh(x = _inversed_2209_cast_fp16_to_fp32)[name = string("op_2210")];
+            string var_2210_to_fp16_dtype_0 = const()[name = string("op_2210_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_4_softcap_to_fp16 = const()[name = string("self_attns_4_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_2210_to_fp16 = cast(dtype = var_2210_to_fp16_dtype_0, x = var_2210)[name = string("cast_344")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_27_cast_fp16 = mul(x = var_2210_to_fp16, y = self_attns_4_softcap_to_fp16)[name = string("attn_27_cast_fp16")];
+            string attn_27_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_27_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_27_cast_fp16_to_fp32 = cast(dtype = attn_27_cast_fp16_to_fp32_dtype_0, x = attn_27_cast_fp16)[name = string("cast_343")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_205 = select(a = var_2073, b = attn_27_cast_fp16_to_fp32, cond = var_460)[name = string("input_205")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_2214 = softmax(axis = var_2072, x = input_205)[name = string("op_2214")];
+            tensor<int32, [5]> var_2216 = const()[name = string("op_2216"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_25_transpose_x_0 = const()[name = string("out_25_transpose_x_0"), val = bool(false)];
+            bool out_25_transpose_y_0 = const()[name = string("out_25_transpose_y_0"), val = bool(false)];
+            string var_2214_to_fp16_dtype_0 = const()[name = string("op_2214_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_9_cast_fp16 = transpose(perm = var_2216, x = v_blocks_9_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_2214_to_fp16 = cast(dtype = var_2214_to_fp16_dtype_0, x = var_2214)[name = string("cast_342")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_25_cast_fp16 = matmul(transpose_x = out_25_transpose_x_0, transpose_y = out_25_transpose_y_0, x = var_2214_to_fp16, y = values_t_9_cast_fp16)[name = string("out_25_cast_fp16")];
+            tensor<int32, [5]> var_2219 = const()[name = string("op_2219"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_2221 = const()[name = string("op_2221"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_2220_cast_fp16 = transpose(perm = var_2219, x = out_25_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 60, 1024]> out_27_cast_fp16 = reshape(shape = var_2221, x = var_2220_cast_fp16)[name = string("out_27_cast_fp16")];
+            tensor<int32, [3]> var_2224_begin_0 = const()[name = string("op_2224_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_2224_end_0 = const()[name = string("op_2224_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_2224_end_mask_0 = const()[name = string("op_2224_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_2224_cast_fp16 = slice_by_index(begin = var_2224_begin_0, end = var_2224_end_0, end_mask = var_2224_end_mask_0, x = out_27_cast_fp16)[name = string("op_2224_cast_fp16")];
+            fp16 self_attns_4_post_input_min_to_fp16 = const()[name = string("self_attns_4_post_input_min_to_fp16"), val = fp16(-0x1.28p+4)];
+            fp16 self_attns_4_post_input_max_to_fp16 = const()[name = string("self_attns_4_post_input_max_to_fp16"), val = fp16(0x1.26p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_125_cast_fp16 = clip(alpha = self_attns_4_post_input_min_to_fp16, beta = self_attns_4_post_input_max_to_fp16, x = var_2224_cast_fp16)[name = string("clip_125_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_4_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(54879616))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55403968))))[name = string("self_attns_4_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_51_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_4_post_linear_weight_to_fp16_palettized, x = clip_125_cast_fp16)[name = string("linear_51_cast_fp16")];
+            fp16 self_attns_4_post_output_min_to_fp16 = const()[name = string("self_attns_4_post_output_min_to_fp16"), val = fp16(-0x1.acp+5)];
+            fp16 self_attns_4_post_output_max_to_fp16 = const()[name = string("self_attns_4_post_output_max_to_fp16"), val = fp16(0x1.a8p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_126_cast_fp16 = clip(alpha = self_attns_4_post_output_min_to_fp16, beta = self_attns_4_post_output_max_to_fp16, x = linear_51_cast_fp16)[name = string("clip_126_cast_fp16")];
+            fp16 var_2236_to_fp16 = const()[name = string("op_2236_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2237_to_fp16 = const()[name = string("op_2237_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_127_cast_fp16 = clip(alpha = var_2236_to_fp16, beta = var_2237_to_fp16, x = clip_126_cast_fp16)[name = string("clip_127_cast_fp16")];
+            string clip_127_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_127_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2239 = const()[name = string("op_2239"), val = fp32(-0x1p-1)];
+            fp32 var_2243_promoted = const()[name = string("op_2243_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_127_cast_fp16_to_fp32 = cast(dtype = clip_127_cast_fp16_to_fp32_dtype_0, x = clip_127_cast_fp16)[name = string("cast_341")];
+            tensor<fp32, [1, 50, 1024]> var_2249 = pow(x = clip_127_cast_fp16_to_fp32, y = var_2243_promoted)[name = string("op_2249")];
+            tensor<int32, [1]> var_2251_axes_0 = const()[name = string("op_2251_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2251_keep_dims_0 = const()[name = string("op_2251_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2251 = reduce_mean(axes = var_2251_axes_0, keep_dims = var_2251_keep_dims_0, x = var_2249)[name = string("op_2251")];
+            string var_2251_to_fp16_dtype_0 = const()[name = string("op_2251_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2252_to_fp16 = const()[name = string("op_2252_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2251_to_fp16 = cast(dtype = var_2251_to_fp16_dtype_0, x = var_2251)[name = string("cast_340")];
+            tensor<fp16, [1, 50, 1]> mean_squared_79_cast_fp16 = add(x = var_2251_to_fp16, y = var_2252_to_fp16)[name = string("mean_squared_79_cast_fp16")];
+            string mean_squared_79_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_79_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_79_cast_fp16_to_fp32 = cast(dtype = mean_squared_79_cast_fp16_to_fp32_dtype_0, x = mean_squared_79_cast_fp16)[name = string("cast_339")];
+            tensor<fp32, [1, 50, 1]> var_2254 = pow(x = mean_squared_79_cast_fp16_to_fp32, y = var_2239)[name = string("op_2254")];
+            string var_2254_to_fp16_dtype_0 = const()[name = string("op_2254_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2254_to_fp16 = cast(dtype = var_2254_to_fp16_dtype_0, x = var_2254)[name = string("cast_338")];
+            tensor<fp16, [1, 50, 1024]> normed_output_157_cast_fp16 = mul(x = clip_127_cast_fp16, y = var_2254_to_fp16)[name = string("normed_output_157_cast_fp16")];
+            tensor<fp16, [1024]> const_70_to_fp16 = const()[name = string("const_70_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55405056)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_159_cast_fp16 = mul(x = normed_output_157_cast_fp16, y = const_70_to_fp16)[name = string("normed_output_159_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_469_cast_fp16 = add(x = normed_output_159_cast_fp16, y = hidden_states_443_cast_fp16)[name = string("hidden_states_469_cast_fp16")];
+            string hidden_states_469_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_469_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2261 = const()[name = string("op_2261"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_2262 = const()[name = string("op_2262"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_2274 = const()[name = string("op_2274"), val = fp32(-0x1p-1)];
+            fp32 var_2270_promoted = const()[name = string("op_2270_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_469_cast_fp16_to_fp32 = cast(dtype = hidden_states_469_cast_fp16_to_fp32_dtype_0, x = hidden_states_469_cast_fp16)[name = string("cast_337")];
+            tensor<fp32, [1, 50, 1024]> var_2282 = pow(x = hidden_states_469_cast_fp16_to_fp32, y = var_2270_promoted)[name = string("op_2282")];
+            tensor<int32, [1]> var_2284_axes_0 = const()[name = string("op_2284_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2284_keep_dims_0 = const()[name = string("op_2284_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2284 = reduce_mean(axes = var_2284_axes_0, keep_dims = var_2284_keep_dims_0, x = var_2282)[name = string("op_2284")];
+            string var_2284_to_fp16_dtype_0 = const()[name = string("op_2284_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2285_to_fp16 = const()[name = string("op_2285_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2284_to_fp16 = cast(dtype = var_2284_to_fp16_dtype_0, x = var_2284)[name = string("cast_336")];
+            tensor<fp16, [1, 50, 1]> mean_squared_81_cast_fp16 = add(x = var_2284_to_fp16, y = var_2285_to_fp16)[name = string("mean_squared_81_cast_fp16")];
+            string mean_squared_81_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_81_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_81_cast_fp16_to_fp32 = cast(dtype = mean_squared_81_cast_fp16_to_fp32_dtype_0, x = mean_squared_81_cast_fp16)[name = string("cast_335")];
+            tensor<fp32, [1, 50, 1]> var_2287 = pow(x = mean_squared_81_cast_fp16_to_fp32, y = var_2274)[name = string("op_2287")];
+            string var_2287_to_fp16_dtype_0 = const()[name = string("op_2287_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2287_to_fp16 = cast(dtype = var_2287_to_fp16_dtype_0, x = var_2287)[name = string("cast_334")];
+            tensor<fp16, [1, 50, 1024]> normed_output_161_cast_fp16 = mul(x = hidden_states_469_cast_fp16, y = var_2287_to_fp16)[name = string("normed_output_161_cast_fp16")];
+            tensor<fp16, [1024]> const_71_to_fp16 = const()[name = string("const_71_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55407168)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_163_cast_fp16 = mul(x = normed_output_161_cast_fp16, y = const_71_to_fp16)[name = string("normed_output_163_cast_fp16")];
+            fp16 lconv1ds_4_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_4_linear_start_input_min_to_fp16"), val = fp16(-0x1.4ep+3)];
+            fp16 lconv1ds_4_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_4_linear_start_input_max_to_fp16"), val = fp16(0x1.4ap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_128_cast_fp16 = clip(alpha = lconv1ds_4_linear_start_input_min_to_fp16, beta = lconv1ds_4_linear_start_input_max_to_fp16, x = normed_output_163_cast_fp16)[name = string("clip_128_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_4_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55409280))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56457920))))[name = string("lconv1ds_4_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_52_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_4_linear_start_linear_weight_to_fp16_palettized, x = clip_128_cast_fp16)[name = string("linear_52_cast_fp16")];
+            fp16 lconv1ds_4_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_4_linear_start_output_min_to_fp16"), val = fp16(-0x1.7cp+4)];
+            fp16 lconv1ds_4_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_4_linear_start_output_max_to_fp16"), val = fp16(0x1.7ap+4)];
+            tensor<fp16, [1, 50, 2048]> clip_129_cast_fp16 = clip(alpha = lconv1ds_4_linear_start_output_min_to_fp16, beta = lconv1ds_4_linear_start_output_max_to_fp16, x = linear_52_cast_fp16)[name = string("clip_129_cast_fp16")];
+            int32 hidden_states_477_split_num_splits_0 = const()[name = string("hidden_states_477_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_477_split_axis_0 = const()[name = string("hidden_states_477_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_477_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_477_split_cast_fp16_1 = split(axis = hidden_states_477_split_axis_0, num_splits = hidden_states_477_split_num_splits_0, x = clip_129_cast_fp16)[name = string("hidden_states_477_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_477_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_477_split_cast_fp16_1)[name = string("hidden_states_477_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_477_cast_fp16 = mul(x = hidden_states_477_split_cast_fp16_0, y = hidden_states_477_split_1_sigmoid_cast_fp16)[name = string("hidden_states_477_cast_fp16")];
+            tensor<int32, [3]> input_213_perm_0 = const()[name = string("input_213_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_215_pad_0 = const()[name = string("input_215_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_215_mode_0 = const()[name = string("input_215_mode_0"), val = string("constant")];
+            fp16 const_72_to_fp16 = const()[name = string("const_72_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_213_cast_fp16 = transpose(perm = input_213_perm_0, x = hidden_states_477_cast_fp16)[name = string("transpose_43")];
+            tensor<fp16, [1, 1024, 54]> input_215_cast_fp16 = pad(constant_val = const_72_to_fp16, mode = input_215_mode_0, pad = input_215_pad_0, x = input_213_cast_fp16)[name = string("input_215_cast_fp16")];
+            string var_2313_pad_type_0 = const()[name = string("op_2313_pad_type_0"), val = string("valid")];
+            int32 var_2313_groups_0 = const()[name = string("op_2313_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_2313_strides_0 = const()[name = string("op_2313_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2313_pad_0 = const()[name = string("op_2313_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2313_dilations_0 = const()[name = string("op_2313_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_4_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56460032))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56462656))))[name = string("lconv1ds_4_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_2313_cast_fp16 = conv(dilations = var_2313_dilations_0, groups = var_2313_groups_0, pad = var_2313_pad_0, pad_type = var_2313_pad_type_0, strides = var_2313_strides_0, weight = lconv1ds_4_depthwise_conv1d_weight_to_fp16_palettized, x = input_215_cast_fp16)[name = string("op_2313_cast_fp16")];
+            tensor<int32, [3]> hidden_states_479_perm_0 = const()[name = string("hidden_states_479_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_479_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_479_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_479_cast_fp16 = transpose(perm = hidden_states_479_perm_0, x = var_2313_cast_fp16)[name = string("transpose_42")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_479_cast_fp16_to_fp32 = cast(dtype = hidden_states_479_cast_fp16_to_fp32_dtype_0, x = hidden_states_479_cast_fp16)[name = string("cast_333")];
+            tensor<fp32, [1, 50, 1024]> clip_130 = clip(alpha = var_2262, beta = var_2261, x = hidden_states_479_cast_fp16_to_fp32)[name = string("clip_130")];
+            fp32 var_2270_promoted_1 = const()[name = string("op_2270_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2318 = pow(x = clip_130, y = var_2270_promoted_1)[name = string("op_2318")];
+            tensor<int32, [1]> var_2320_axes_0 = const()[name = string("op_2320_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2320_keep_dims_0 = const()[name = string("op_2320_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2320 = reduce_mean(axes = var_2320_axes_0, keep_dims = var_2320_keep_dims_0, x = var_2318)[name = string("op_2320")];
+            string var_2320_to_fp16_dtype_0 = const()[name = string("op_2320_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2321_to_fp16 = const()[name = string("op_2321_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2320_to_fp16 = cast(dtype = var_2320_to_fp16_dtype_0, x = var_2320)[name = string("cast_332")];
+            tensor<fp16, [1, 50, 1]> mean_squared_83_cast_fp16 = add(x = var_2320_to_fp16, y = var_2321_to_fp16)[name = string("mean_squared_83_cast_fp16")];
+            string mean_squared_83_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_83_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_83_cast_fp16_to_fp32 = cast(dtype = mean_squared_83_cast_fp16_to_fp32_dtype_0, x = mean_squared_83_cast_fp16)[name = string("cast_331")];
+            tensor<fp32, [1, 50, 1]> var_2323 = pow(x = mean_squared_83_cast_fp16_to_fp32, y = var_2274)[name = string("op_2323")];
+            string clip_130_to_fp16_dtype_0 = const()[name = string("clip_130_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2323_to_fp16_dtype_0 = const()[name = string("op_2323_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_130_to_fp16 = cast(dtype = clip_130_to_fp16_dtype_0, x = clip_130)[name = string("cast_329")];
+            tensor<fp16, [1, 50, 1]> var_2323_to_fp16 = cast(dtype = var_2323_to_fp16_dtype_0, x = var_2323)[name = string("cast_330")];
+            tensor<fp16, [1, 50, 1024]> normed_output_165_cast_fp16 = mul(x = clip_130_to_fp16, y = var_2323_to_fp16)[name = string("normed_output_165_cast_fp16")];
+            tensor<fp16, [1024]> const_73_to_fp16 = const()[name = string("const_73_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56463744)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_167_cast_fp16 = mul(x = normed_output_165_cast_fp16, y = const_73_to_fp16)[name = string("normed_output_167_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_485_cast_fp16 = silu(x = normed_output_167_cast_fp16)[name = string("hidden_states_485_cast_fp16")];
+            fp16 lconv1ds_4_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_4_linear_end_input_min_to_fp16"), val = fp16(-0x1.5cp+3)];
+            fp16 lconv1ds_4_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_4_linear_end_input_max_to_fp16"), val = fp16(0x1.5ap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_131_cast_fp16 = clip(alpha = lconv1ds_4_linear_end_input_min_to_fp16, beta = lconv1ds_4_linear_end_input_max_to_fp16, x = hidden_states_485_cast_fp16)[name = string("clip_131_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_4_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56465856))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56990208))))[name = string("lconv1ds_4_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_53_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_4_linear_end_linear_weight_to_fp16_palettized, x = clip_131_cast_fp16)[name = string("linear_53_cast_fp16")];
+            fp16 lconv1ds_4_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_4_linear_end_output_min_to_fp16"), val = fp16(-0x1.f4p+2)];
+            fp16 lconv1ds_4_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_4_linear_end_output_max_to_fp16"), val = fp16(0x1.fp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_132_cast_fp16 = clip(alpha = lconv1ds_4_linear_end_output_min_to_fp16, beta = lconv1ds_4_linear_end_output_max_to_fp16, x = linear_53_cast_fp16)[name = string("clip_132_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_491_cast_fp16 = add(x = clip_132_cast_fp16, y = hidden_states_469_cast_fp16)[name = string("hidden_states_491_cast_fp16")];
+            string hidden_states_491_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_491_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2347 = const()[name = string("op_2347"), val = fp32(-0x1p-1)];
+            fp32 var_2348 = const()[name = string("op_2348"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_2349 = const()[name = string("op_2349"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_491_cast_fp16_to_fp32 = cast(dtype = hidden_states_491_cast_fp16_to_fp32_dtype_0, x = hidden_states_491_cast_fp16)[name = string("cast_328")];
+            tensor<fp32, [1, 50, 1024]> clip_133 = clip(alpha = var_2349, beta = var_2348, x = hidden_states_491_cast_fp16_to_fp32)[name = string("clip_133")];
+            fp32 var_2343_promoted = const()[name = string("op_2343_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2357 = pow(x = clip_133, y = var_2343_promoted)[name = string("op_2357")];
+            tensor<int32, [1]> var_2359_axes_0 = const()[name = string("op_2359_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2359_keep_dims_0 = const()[name = string("op_2359_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2359 = reduce_mean(axes = var_2359_axes_0, keep_dims = var_2359_keep_dims_0, x = var_2357)[name = string("op_2359")];
+            string var_2359_to_fp16_dtype_0 = const()[name = string("op_2359_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2360_to_fp16 = const()[name = string("op_2360_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2359_to_fp16 = cast(dtype = var_2359_to_fp16_dtype_0, x = var_2359)[name = string("cast_327")];
+            tensor<fp16, [1, 50, 1]> mean_squared_85_cast_fp16 = add(x = var_2359_to_fp16, y = var_2360_to_fp16)[name = string("mean_squared_85_cast_fp16")];
+            string mean_squared_85_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_85_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_85_cast_fp16_to_fp32 = cast(dtype = mean_squared_85_cast_fp16_to_fp32_dtype_0, x = mean_squared_85_cast_fp16)[name = string("cast_326")];
+            tensor<fp32, [1, 50, 1]> var_2362 = pow(x = mean_squared_85_cast_fp16_to_fp32, y = var_2347)[name = string("op_2362")];
+            string clip_133_to_fp16_dtype_0 = const()[name = string("clip_133_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2362_to_fp16_dtype_0 = const()[name = string("op_2362_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_133_to_fp16 = cast(dtype = clip_133_to_fp16_dtype_0, x = clip_133)[name = string("cast_324")];
+            tensor<fp16, [1, 50, 1]> var_2362_to_fp16 = cast(dtype = var_2362_to_fp16_dtype_0, x = var_2362)[name = string("cast_325")];
+            tensor<fp16, [1, 50, 1024]> normed_output_169_cast_fp16 = mul(x = clip_133_to_fp16, y = var_2362_to_fp16)[name = string("normed_output_169_cast_fp16")];
+            tensor<fp16, [1024]> const_74_to_fp16 = const()[name = string("const_74_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56991296)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_171_cast_fp16 = mul(x = normed_output_169_cast_fp16, y = const_74_to_fp16)[name = string("normed_output_171_cast_fp16")];
+            fp16 feed_forward2s_4_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.aap+3)];
+            fp16 feed_forward2s_4_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.a6p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_134_cast_fp16 = clip(alpha = feed_forward2s_4_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_4_ffw_layer_1_input_max_to_fp16, x = normed_output_171_cast_fp16)[name = string("clip_134_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_4_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56993408))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59090624))))[name = string("feed_forward2s_4_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_54_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_4_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_134_cast_fp16)[name = string("linear_54_cast_fp16")];
+            fp16 feed_forward2s_4_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.ep+4)];
+            fp16 feed_forward2s_4_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.dcp+4)];
+            tensor<fp16, [1, 50, 4096]> clip_135_cast_fp16 = clip(alpha = feed_forward2s_4_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_4_ffw_layer_1_output_max_to_fp16, x = linear_54_cast_fp16)[name = string("clip_135_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_501_cast_fp16 = silu(x = clip_135_cast_fp16)[name = string("hidden_states_501_cast_fp16")];
+            fp16 feed_forward2s_4_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.8ap+3)];
+            fp16 feed_forward2s_4_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.88p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_136_cast_fp16 = clip(alpha = feed_forward2s_4_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_4_ffw_layer_2_input_max_to_fp16, x = hidden_states_501_cast_fp16)[name = string("clip_136_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_4_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59094784))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61192000))))[name = string("feed_forward2s_4_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_55_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_4_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_136_cast_fp16)[name = string("linear_55_cast_fp16")];
+            fp16 feed_forward2s_4_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.2cp+6)];
+            fp16 feed_forward2s_4_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_4_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.2ap+6)];
+            tensor<fp16, [1, 50, 1024]> clip_137_cast_fp16 = clip(alpha = feed_forward2s_4_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_4_ffw_layer_2_output_max_to_fp16, x = linear_55_cast_fp16)[name = string("clip_137_cast_fp16")];
+            string clip_137_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_137_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_137_cast_fp16_to_fp32 = cast(dtype = clip_137_cast_fp16_to_fp32_dtype_0, x = clip_137_cast_fp16)[name = string("cast_323")];
+            tensor<fp32, [1, 50, 1024]> clip_138 = clip(alpha = var_2349, beta = var_2348, x = clip_137_cast_fp16_to_fp32)[name = string("clip_138")];
+            fp32 var_2343_promoted_1 = const()[name = string("op_2343_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2389 = pow(x = clip_138, y = var_2343_promoted_1)[name = string("op_2389")];
+            tensor<int32, [1]> var_2391_axes_0 = const()[name = string("op_2391_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2391_keep_dims_0 = const()[name = string("op_2391_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2391 = reduce_mean(axes = var_2391_axes_0, keep_dims = var_2391_keep_dims_0, x = var_2389)[name = string("op_2391")];
+            string var_2391_to_fp16_dtype_0 = const()[name = string("op_2391_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2392_to_fp16 = const()[name = string("op_2392_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2391_to_fp16 = cast(dtype = var_2391_to_fp16_dtype_0, x = var_2391)[name = string("cast_322")];
+            tensor<fp16, [1, 50, 1]> mean_squared_87_cast_fp16 = add(x = var_2391_to_fp16, y = var_2392_to_fp16)[name = string("mean_squared_87_cast_fp16")];
+            string mean_squared_87_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_87_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_87_cast_fp16_to_fp32 = cast(dtype = mean_squared_87_cast_fp16_to_fp32_dtype_0, x = mean_squared_87_cast_fp16)[name = string("cast_321")];
+            tensor<fp32, [1, 50, 1]> var_2394 = pow(x = mean_squared_87_cast_fp16_to_fp32, y = var_2347)[name = string("op_2394")];
+            string clip_138_to_fp16_dtype_0 = const()[name = string("clip_138_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2394_to_fp16_dtype_0 = const()[name = string("op_2394_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_138_to_fp16 = cast(dtype = clip_138_to_fp16_dtype_0, x = clip_138)[name = string("cast_319")];
+            tensor<fp16, [1, 50, 1]> var_2394_to_fp16 = cast(dtype = var_2394_to_fp16_dtype_0, x = var_2394)[name = string("cast_320")];
+            tensor<fp16, [1, 50, 1024]> normed_output_173_cast_fp16 = mul(x = clip_138_to_fp16, y = var_2394_to_fp16)[name = string("normed_output_173_cast_fp16")];
+            tensor<fp16, [1024]> const_75_to_fp16 = const()[name = string("const_75_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61193088)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_175_cast_fp16 = mul(x = normed_output_173_cast_fp16, y = const_75_to_fp16)[name = string("normed_output_175_cast_fp16")];
+            fp16 var_2339_to_fp16 = const()[name = string("op_2339_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_513_cast_fp16 = mul(x = normed_output_175_cast_fp16, y = var_2339_to_fp16)[name = string("hidden_states_513_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_515_cast_fp16 = add(x = hidden_states_513_cast_fp16, y = hidden_states_491_cast_fp16)[name = string("hidden_states_515_cast_fp16")];
+            fp16 var_2401_to_fp16 = const()[name = string("op_2401_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2402_to_fp16 = const()[name = string("op_2402_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_139_cast_fp16 = clip(alpha = var_2401_to_fp16, beta = var_2402_to_fp16, x = hidden_states_515_cast_fp16)[name = string("clip_139_cast_fp16")];
+            string clip_139_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_139_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2404 = const()[name = string("op_2404"), val = fp32(-0x1p-1)];
+            fp32 var_2408_promoted = const()[name = string("op_2408_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_139_cast_fp16_to_fp32 = cast(dtype = clip_139_cast_fp16_to_fp32_dtype_0, x = clip_139_cast_fp16)[name = string("cast_318")];
+            tensor<fp32, [1, 50, 1024]> var_2414 = pow(x = clip_139_cast_fp16_to_fp32, y = var_2408_promoted)[name = string("op_2414")];
+            tensor<int32, [1]> var_2416_axes_0 = const()[name = string("op_2416_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2416_keep_dims_0 = const()[name = string("op_2416_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2416 = reduce_mean(axes = var_2416_axes_0, keep_dims = var_2416_keep_dims_0, x = var_2414)[name = string("op_2416")];
+            string var_2416_to_fp16_dtype_0 = const()[name = string("op_2416_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2417_to_fp16 = const()[name = string("op_2417_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2416_to_fp16 = cast(dtype = var_2416_to_fp16_dtype_0, x = var_2416)[name = string("cast_317")];
+            tensor<fp16, [1, 50, 1]> mean_squared_89_cast_fp16 = add(x = var_2416_to_fp16, y = var_2417_to_fp16)[name = string("mean_squared_89_cast_fp16")];
+            string mean_squared_89_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_89_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_89_cast_fp16_to_fp32 = cast(dtype = mean_squared_89_cast_fp16_to_fp32_dtype_0, x = mean_squared_89_cast_fp16)[name = string("cast_316")];
+            tensor<fp32, [1, 50, 1]> var_2419 = pow(x = mean_squared_89_cast_fp16_to_fp32, y = var_2404)[name = string("op_2419")];
+            string var_2419_to_fp16_dtype_0 = const()[name = string("op_2419_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2419_to_fp16 = cast(dtype = var_2419_to_fp16_dtype_0, x = var_2419)[name = string("cast_315")];
+            tensor<fp16, [1, 50, 1024]> normed_output_177_cast_fp16 = mul(x = clip_139_cast_fp16, y = var_2419_to_fp16)[name = string("normed_output_177_cast_fp16")];
+            tensor<fp16, [1024]> const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61195200)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_179_cast_fp16 = mul(x = normed_output_177_cast_fp16, y = const_76_to_fp16)[name = string("normed_output_179_cast_fp16")];
+            string normed_output_179_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_179_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2432 = const()[name = string("op_2432"), val = fp32(-0x1p-1)];
+            fp32 var_2433 = const()[name = string("op_2433"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_2434 = const()[name = string("op_2434"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_179_cast_fp16_to_fp32 = cast(dtype = normed_output_179_cast_fp16_to_fp32_dtype_0, x = normed_output_179_cast_fp16)[name = string("cast_314")];
+            tensor<fp32, [1, 50, 1024]> clip_140 = clip(alpha = var_2434, beta = var_2433, x = normed_output_179_cast_fp16_to_fp32)[name = string("clip_140")];
+            fp32 var_2428_promoted = const()[name = string("op_2428_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2442 = pow(x = clip_140, y = var_2428_promoted)[name = string("op_2442")];
+            tensor<int32, [1]> var_2444_axes_0 = const()[name = string("op_2444_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2444_keep_dims_0 = const()[name = string("op_2444_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2444 = reduce_mean(axes = var_2444_axes_0, keep_dims = var_2444_keep_dims_0, x = var_2442)[name = string("op_2444")];
+            string var_2444_to_fp16_dtype_0 = const()[name = string("op_2444_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2445_to_fp16 = const()[name = string("op_2445_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2444_to_fp16 = cast(dtype = var_2444_to_fp16_dtype_0, x = var_2444)[name = string("cast_313")];
+            tensor<fp16, [1, 50, 1]> mean_squared_91_cast_fp16 = add(x = var_2444_to_fp16, y = var_2445_to_fp16)[name = string("mean_squared_91_cast_fp16")];
+            string mean_squared_91_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_91_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_91_cast_fp16_to_fp32 = cast(dtype = mean_squared_91_cast_fp16_to_fp32_dtype_0, x = mean_squared_91_cast_fp16)[name = string("cast_312")];
+            tensor<fp32, [1, 50, 1]> var_2447 = pow(x = mean_squared_91_cast_fp16_to_fp32, y = var_2432)[name = string("op_2447")];
+            string clip_140_to_fp16_dtype_0 = const()[name = string("clip_140_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2447_to_fp16_dtype_0 = const()[name = string("op_2447_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_140_to_fp16 = cast(dtype = clip_140_to_fp16_dtype_0, x = clip_140)[name = string("cast_310")];
+            tensor<fp16, [1, 50, 1]> var_2447_to_fp16 = cast(dtype = var_2447_to_fp16_dtype_0, x = var_2447)[name = string("cast_311")];
+            tensor<fp16, [1, 50, 1024]> normed_output_181_cast_fp16 = mul(x = clip_140_to_fp16, y = var_2447_to_fp16)[name = string("normed_output_181_cast_fp16")];
+            tensor<fp16, [1024]> const_77_to_fp16 = const()[name = string("const_77_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61197312)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_183_cast_fp16 = mul(x = normed_output_181_cast_fp16, y = const_77_to_fp16)[name = string("normed_output_183_cast_fp16")];
+            fp16 feed_forward1s_5_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.9cp+3)];
+            fp16 feed_forward1s_5_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.9ap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_141_cast_fp16 = clip(alpha = feed_forward1s_5_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_5_ffw_layer_1_input_max_to_fp16, x = normed_output_183_cast_fp16)[name = string("clip_141_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_5_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61199424))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63296640))))[name = string("feed_forward1s_5_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_56_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_5_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_141_cast_fp16)[name = string("linear_56_cast_fp16")];
+            fp16 feed_forward1s_5_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.dcp+4)];
+            fp16 feed_forward1s_5_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.d8p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_142_cast_fp16 = clip(alpha = feed_forward1s_5_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_5_ffw_layer_1_output_max_to_fp16, x = linear_56_cast_fp16)[name = string("clip_142_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_531_cast_fp16 = silu(x = clip_142_cast_fp16)[name = string("hidden_states_531_cast_fp16")];
+            fp16 feed_forward1s_5_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.4ep+3)];
+            fp16 feed_forward1s_5_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.4cp+3)];
+            tensor<fp16, [1, 50, 4096]> clip_143_cast_fp16 = clip(alpha = feed_forward1s_5_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_5_ffw_layer_2_input_max_to_fp16, x = hidden_states_531_cast_fp16)[name = string("clip_143_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_5_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(63300800))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65398016))))[name = string("feed_forward1s_5_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_57_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_5_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_143_cast_fp16)[name = string("linear_57_cast_fp16")];
+            fp16 feed_forward1s_5_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.d6p+5)];
+            fp16 feed_forward1s_5_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_5_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.d2p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_144_cast_fp16 = clip(alpha = feed_forward1s_5_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_5_ffw_layer_2_output_max_to_fp16, x = linear_57_cast_fp16)[name = string("clip_144_cast_fp16")];
+            string clip_144_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_144_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_144_cast_fp16_to_fp32 = cast(dtype = clip_144_cast_fp16_to_fp32_dtype_0, x = clip_144_cast_fp16)[name = string("cast_309")];
+            tensor<fp32, [1, 50, 1024]> clip_145 = clip(alpha = var_2434, beta = var_2433, x = clip_144_cast_fp16_to_fp32)[name = string("clip_145")];
+            fp32 var_2428_promoted_1 = const()[name = string("op_2428_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2474 = pow(x = clip_145, y = var_2428_promoted_1)[name = string("op_2474")];
+            tensor<int32, [1]> var_2476_axes_0 = const()[name = string("op_2476_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2476_keep_dims_0 = const()[name = string("op_2476_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2476 = reduce_mean(axes = var_2476_axes_0, keep_dims = var_2476_keep_dims_0, x = var_2474)[name = string("op_2476")];
+            string var_2476_to_fp16_dtype_0 = const()[name = string("op_2476_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2477_to_fp16 = const()[name = string("op_2477_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2476_to_fp16 = cast(dtype = var_2476_to_fp16_dtype_0, x = var_2476)[name = string("cast_308")];
+            tensor<fp16, [1, 50, 1]> mean_squared_93_cast_fp16 = add(x = var_2476_to_fp16, y = var_2477_to_fp16)[name = string("mean_squared_93_cast_fp16")];
+            string mean_squared_93_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_93_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_93_cast_fp16_to_fp32 = cast(dtype = mean_squared_93_cast_fp16_to_fp32_dtype_0, x = mean_squared_93_cast_fp16)[name = string("cast_307")];
+            tensor<fp32, [1, 50, 1]> var_2479 = pow(x = mean_squared_93_cast_fp16_to_fp32, y = var_2432)[name = string("op_2479")];
+            string clip_145_to_fp16_dtype_0 = const()[name = string("clip_145_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2479_to_fp16_dtype_0 = const()[name = string("op_2479_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_145_to_fp16 = cast(dtype = clip_145_to_fp16_dtype_0, x = clip_145)[name = string("cast_305")];
+            tensor<fp16, [1, 50, 1]> var_2479_to_fp16 = cast(dtype = var_2479_to_fp16_dtype_0, x = var_2479)[name = string("cast_306")];
+            tensor<fp16, [1, 50, 1024]> normed_output_185_cast_fp16 = mul(x = clip_145_to_fp16, y = var_2479_to_fp16)[name = string("normed_output_185_cast_fp16")];
+            tensor<fp16, [1024]> const_78_to_fp16 = const()[name = string("const_78_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65399104)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_187_cast_fp16 = mul(x = normed_output_185_cast_fp16, y = const_78_to_fp16)[name = string("normed_output_187_cast_fp16")];
+            fp16 var_2424_to_fp16 = const()[name = string("op_2424_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_543_cast_fp16 = mul(x = normed_output_187_cast_fp16, y = var_2424_to_fp16)[name = string("hidden_states_543_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_545_cast_fp16 = add(x = hidden_states_543_cast_fp16, y = normed_output_179_cast_fp16)[name = string("hidden_states_545_cast_fp16")];
+            fp16 var_2486_to_fp16 = const()[name = string("op_2486_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2487_to_fp16 = const()[name = string("op_2487_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_146_cast_fp16 = clip(alpha = var_2486_to_fp16, beta = var_2487_to_fp16, x = hidden_states_545_cast_fp16)[name = string("clip_146_cast_fp16")];
+            string clip_146_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_146_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2489 = const()[name = string("op_2489"), val = fp32(-0x1p-1)];
+            fp32 var_2493_promoted = const()[name = string("op_2493_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_146_cast_fp16_to_fp32 = cast(dtype = clip_146_cast_fp16_to_fp32_dtype_0, x = clip_146_cast_fp16)[name = string("cast_304")];
+            tensor<fp32, [1, 50, 1024]> var_2499 = pow(x = clip_146_cast_fp16_to_fp32, y = var_2493_promoted)[name = string("op_2499")];
+            tensor<int32, [1]> var_2501_axes_0 = const()[name = string("op_2501_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2501_keep_dims_0 = const()[name = string("op_2501_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2501 = reduce_mean(axes = var_2501_axes_0, keep_dims = var_2501_keep_dims_0, x = var_2499)[name = string("op_2501")];
+            string var_2501_to_fp16_dtype_0 = const()[name = string("op_2501_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2502_to_fp16 = const()[name = string("op_2502_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2501_to_fp16 = cast(dtype = var_2501_to_fp16_dtype_0, x = var_2501)[name = string("cast_303")];
+            tensor<fp16, [1, 50, 1]> mean_squared_95_cast_fp16 = add(x = var_2501_to_fp16, y = var_2502_to_fp16)[name = string("mean_squared_95_cast_fp16")];
+            string mean_squared_95_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_95_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_95_cast_fp16_to_fp32 = cast(dtype = mean_squared_95_cast_fp16_to_fp32_dtype_0, x = mean_squared_95_cast_fp16)[name = string("cast_302")];
+            tensor<fp32, [1, 50, 1]> var_2504 = pow(x = mean_squared_95_cast_fp16_to_fp32, y = var_2489)[name = string("op_2504")];
+            string var_2504_to_fp16_dtype_0 = const()[name = string("op_2504_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2504_to_fp16 = cast(dtype = var_2504_to_fp16_dtype_0, x = var_2504)[name = string("cast_301")];
+            tensor<fp16, [1, 50, 1024]> normed_output_189_cast_fp16 = mul(x = clip_146_cast_fp16, y = var_2504_to_fp16)[name = string("normed_output_189_cast_fp16")];
+            tensor<fp16, [1024]> const_79_to_fp16 = const()[name = string("const_79_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65401216)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_191_cast_fp16 = mul(x = normed_output_189_cast_fp16, y = const_79_to_fp16)[name = string("normed_output_191_cast_fp16")];
+            int32 var_2510 = const()[name = string("op_2510"), val = int32(-1)];
+            fp32 var_2511 = const()[name = string("op_2511"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_5_q_proj_input_min_to_fp16 = const()[name = string("self_attns_5_q_proj_input_min_to_fp16"), val = fp16(-0x1.32p+3)];
+            fp16 self_attns_5_q_proj_input_max_to_fp16 = const()[name = string("self_attns_5_q_proj_input_max_to_fp16"), val = fp16(0x1.3p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_147_cast_fp16 = clip(alpha = self_attns_5_q_proj_input_min_to_fp16, beta = self_attns_5_q_proj_input_max_to_fp16, x = normed_output_191_cast_fp16)[name = string("clip_147_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_5_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65403328))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65927680))))[name = string("self_attns_5_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_58_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_5_q_proj_linear_weight_to_fp16_palettized, x = clip_147_cast_fp16)[name = string("linear_58_cast_fp16")];
+            fp16 self_attns_5_q_proj_output_min_to_fp16 = const()[name = string("self_attns_5_q_proj_output_min_to_fp16"), val = fp16(-0x1.1p+4)];
+            fp16 self_attns_5_q_proj_output_max_to_fp16 = const()[name = string("self_attns_5_q_proj_output_max_to_fp16"), val = fp16(0x1.0ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_148_cast_fp16 = clip(alpha = self_attns_5_q_proj_output_min_to_fp16, beta = self_attns_5_q_proj_output_max_to_fp16, x = linear_58_cast_fp16)[name = string("clip_148_cast_fp16")];
+            tensor<int32, [4]> var_2555 = const()[name = string("op_2555"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_11_cast_fp16 = reshape(shape = var_2555, x = clip_148_cast_fp16)[name = string("q_11_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_5_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(65928768))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66453120))))[name = string("self_attns_5_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_59_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_5_k_proj_linear_weight_to_fp16_palettized, x = clip_147_cast_fp16)[name = string("linear_59_cast_fp16")];
+            fp16 self_attns_5_k_proj_output_min_to_fp16 = const()[name = string("self_attns_5_k_proj_output_min_to_fp16"), val = fp16(-0x1.1p+4)];
+            fp16 self_attns_5_k_proj_output_max_to_fp16 = const()[name = string("self_attns_5_k_proj_output_max_to_fp16"), val = fp16(0x1.0ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_150_cast_fp16 = clip(alpha = self_attns_5_k_proj_output_min_to_fp16, beta = self_attns_5_k_proj_output_max_to_fp16, x = linear_59_cast_fp16)[name = string("clip_150_cast_fp16")];
+            tensor<int32, [4]> var_2567 = const()[name = string("op_2567"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_11_cast_fp16 = reshape(shape = var_2567, x = clip_150_cast_fp16)[name = string("k_11_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_5_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66454208))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66978560))))[name = string("self_attns_5_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_60_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_5_v_proj_linear_weight_to_fp16_palettized, x = clip_147_cast_fp16)[name = string("linear_60_cast_fp16")];
+            fp16 self_attns_5_v_proj_output_min_to_fp16 = const()[name = string("self_attns_5_v_proj_output_min_to_fp16"), val = fp16(-0x1.1p+4)];
+            fp16 self_attns_5_v_proj_output_max_to_fp16 = const()[name = string("self_attns_5_v_proj_output_max_to_fp16"), val = fp16(0x1.0ep+4)];
+            tensor<fp16, [1, 50, 1024]> clip_152_cast_fp16 = clip(alpha = self_attns_5_v_proj_output_min_to_fp16, beta = self_attns_5_v_proj_output_max_to_fp16, x = linear_60_cast_fp16)[name = string("clip_152_cast_fp16")];
+            tensor<int32, [4]> var_2579 = const()[name = string("op_2579"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_243_cast_fp16 = reshape(shape = var_2579, x = clip_152_cast_fp16)[name = string("input_243_cast_fp16")];
+            fp16 var_2581_to_fp16 = const()[name = string("op_2581_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_2582_cast_fp16 = mul(x = q_11_cast_fp16, y = var_2581_to_fp16)[name = string("op_2582_cast_fp16")];
+            tensor<fp16, [128]> var_2583_to_fp16 = const()[name = string("op_2583_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66979648)))];
+            tensor<fp16, [1, 50, 8, 128]> input_239_cast_fp16 = mul(x = var_2582_cast_fp16, y = var_2583_to_fp16)[name = string("input_239_cast_fp16")];
+            fp16 var_2585_to_fp16 = const()[name = string("op_2585_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_241_cast_fp16 = mul(x = k_11_cast_fp16, y = var_2585_to_fp16)[name = string("input_241_cast_fp16")];
+            tensor<int32, [8]> q_padded_11_pad_0 = const()[name = string("q_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_11_mode_0 = const()[name = string("q_padded_11_mode_0"), val = string("constant")];
+            fp16 const_80_to_fp16 = const()[name = string("const_80_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_11_cast_fp16 = pad(constant_val = const_80_to_fp16, mode = q_padded_11_mode_0, pad = q_padded_11_pad_0, x = input_239_cast_fp16)[name = string("q_padded_11_cast_fp16")];
+            tensor<int32, [5]> var_2589 = const()[name = string("op_2589"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_11_cast_fp16 = reshape(shape = var_2589, x = q_padded_11_cast_fp16)[name = string("q_blocks_11_cast_fp16")];
+            tensor<int32, [8]> k_padded_11_pad_0 = const()[name = string("k_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_11_mode_0 = const()[name = string("k_padded_11_mode_0"), val = string("constant")];
+            fp16 const_81_to_fp16 = const()[name = string("const_81_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_11_cast_fp16 = pad(constant_val = const_81_to_fp16, mode = k_padded_11_mode_0, pad = k_padded_11_pad_0, x = input_241_cast_fp16)[name = string("k_padded_11_cast_fp16")];
+            tensor<int32, [8]> v_padded_11_pad_0 = const()[name = string("v_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_11_mode_0 = const()[name = string("v_padded_11_mode_0"), val = string("constant")];
+            fp16 const_82_to_fp16 = const()[name = string("const_82_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_11_cast_fp16 = pad(constant_val = const_82_to_fp16, mode = v_padded_11_mode_0, pad = v_padded_11_pad_0, x = input_243_cast_fp16)[name = string("v_padded_11_cast_fp16")];
+            tensor<int32, [4]> var_2596_begin_0 = const()[name = string("op_2596_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2596_end_0 = const()[name = string("op_2596_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_2596_end_mask_0 = const()[name = string("op_2596_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2596_cast_fp16 = slice_by_index(begin = var_2596_begin_0, end = var_2596_end_0, end_mask = var_2596_end_mask_0, x = k_padded_11_cast_fp16)[name = string("op_2596_cast_fp16")];
+            tensor<int32, [4]> var_2598_begin_0 = const()[name = string("op_2598_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_2598_end_0 = const()[name = string("op_2598_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_2598_end_mask_0 = const()[name = string("op_2598_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2598_cast_fp16 = slice_by_index(begin = var_2598_begin_0, end = var_2598_end_0, end_mask = var_2598_end_mask_0, x = k_padded_11_cast_fp16)[name = string("op_2598_cast_fp16")];
+            tensor<int32, [4]> var_2600_begin_0 = const()[name = string("op_2600_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_2600_end_0 = const()[name = string("op_2600_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_2600_end_mask_0 = const()[name = string("op_2600_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2600_cast_fp16 = slice_by_index(begin = var_2600_begin_0, end = var_2600_end_0, end_mask = var_2600_end_mask_0, x = k_padded_11_cast_fp16)[name = string("op_2600_cast_fp16")];
+            tensor<int32, [4]> var_2602_begin_0 = const()[name = string("op_2602_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_2602_end_0 = const()[name = string("op_2602_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_2602_end_mask_0 = const()[name = string("op_2602_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2602_cast_fp16 = slice_by_index(begin = var_2602_begin_0, end = var_2602_end_0, end_mask = var_2602_end_mask_0, x = k_padded_11_cast_fp16)[name = string("op_2602_cast_fp16")];
+            tensor<int32, [4]> var_2604_begin_0 = const()[name = string("op_2604_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_2604_end_0 = const()[name = string("op_2604_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_2604_end_mask_0 = const()[name = string("op_2604_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2604_cast_fp16 = slice_by_index(begin = var_2604_begin_0, end = var_2604_end_0, end_mask = var_2604_end_mask_0, x = k_padded_11_cast_fp16)[name = string("op_2604_cast_fp16")];
+            int32 k_blocks_11_axis_0 = const()[name = string("k_blocks_11_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_11_cast_fp16 = stack(axis = k_blocks_11_axis_0, values = (var_2596_cast_fp16, var_2598_cast_fp16, var_2600_cast_fp16, var_2602_cast_fp16, var_2604_cast_fp16))[name = string("k_blocks_11_cast_fp16")];
+            tensor<int32, [4]> var_2608_begin_0 = const()[name = string("op_2608_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2608_end_0 = const()[name = string("op_2608_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_2608_end_mask_0 = const()[name = string("op_2608_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2608_cast_fp16 = slice_by_index(begin = var_2608_begin_0, end = var_2608_end_0, end_mask = var_2608_end_mask_0, x = v_padded_11_cast_fp16)[name = string("op_2608_cast_fp16")];
+            tensor<int32, [4]> var_2610_begin_0 = const()[name = string("op_2610_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_2610_end_0 = const()[name = string("op_2610_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_2610_end_mask_0 = const()[name = string("op_2610_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2610_cast_fp16 = slice_by_index(begin = var_2610_begin_0, end = var_2610_end_0, end_mask = var_2610_end_mask_0, x = v_padded_11_cast_fp16)[name = string("op_2610_cast_fp16")];
+            tensor<int32, [4]> var_2612_begin_0 = const()[name = string("op_2612_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_2612_end_0 = const()[name = string("op_2612_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_2612_end_mask_0 = const()[name = string("op_2612_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2612_cast_fp16 = slice_by_index(begin = var_2612_begin_0, end = var_2612_end_0, end_mask = var_2612_end_mask_0, x = v_padded_11_cast_fp16)[name = string("op_2612_cast_fp16")];
+            tensor<int32, [4]> var_2614_begin_0 = const()[name = string("op_2614_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_2614_end_0 = const()[name = string("op_2614_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_2614_end_mask_0 = const()[name = string("op_2614_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2614_cast_fp16 = slice_by_index(begin = var_2614_begin_0, end = var_2614_end_0, end_mask = var_2614_end_mask_0, x = v_padded_11_cast_fp16)[name = string("op_2614_cast_fp16")];
+            tensor<int32, [4]> var_2616_begin_0 = const()[name = string("op_2616_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_2616_end_0 = const()[name = string("op_2616_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_2616_end_mask_0 = const()[name = string("op_2616_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_2616_cast_fp16 = slice_by_index(begin = var_2616_begin_0, end = var_2616_end_0, end_mask = var_2616_end_mask_0, x = v_padded_11_cast_fp16)[name = string("op_2616_cast_fp16")];
+            int32 v_blocks_11_axis_0 = const()[name = string("v_blocks_11_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_11_cast_fp16 = stack(axis = v_blocks_11_axis_0, values = (var_2608_cast_fp16, var_2610_cast_fp16, var_2612_cast_fp16, var_2614_cast_fp16, var_2616_cast_fp16))[name = string("v_blocks_11_cast_fp16")];
+            tensor<int32, [5]> var_2624 = const()[name = string("op_2624"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_2626 = const()[name = string("op_2626"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_11_transpose_x_0 = const()[name = string("matrix_ac_11_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_11_transpose_y_0 = const()[name = string("matrix_ac_11_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_11_cast_fp16 = transpose(perm = var_2624, x = q_blocks_11_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_11_cast_fp16 = transpose(perm = var_2626, x = k_blocks_11_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_11_cast_fp16 = matmul(transpose_x = matrix_ac_11_transpose_x_0, transpose_y = matrix_ac_11_transpose_y_0, x = queries_11_cast_fp16, y = keys_t_11_cast_fp16)[name = string("matrix_ac_11_cast_fp16")];
+            tensor<int32, [4]> var_2629 = const()[name = string("op_2629"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_11_cast_fp16 = reshape(shape = var_2629, x = queries_11_cast_fp16)[name = string("q_flat_11_cast_fp16")];
+            bool matrix_bd_51_transpose_x_0 = const()[name = string("matrix_bd_51_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_51_transpose_y_0 = const()[name = string("matrix_bd_51_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_11_to_fp16 = const()[name = string("rel_k_t_11_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(66979968)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_51_cast_fp16 = matmul(transpose_x = matrix_bd_51_transpose_x_0, transpose_y = matrix_bd_51_transpose_y_0, x = q_flat_11_cast_fp16, y = rel_k_t_11_to_fp16)[name = string("matrix_bd_51_cast_fp16")];
+            tensor<int32, [5]> var_2634 = const()[name = string("op_2634"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_245_cast_fp16 = reshape(shape = var_2634, x = matrix_bd_51_cast_fp16)[name = string("input_245_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_53_pad_0 = const()[name = string("matrix_bd_53_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67006656)))];
+            string matrix_bd_53_mode_0 = const()[name = string("matrix_bd_53_mode_0"), val = string("constant")];
+            fp16 const_84_to_fp16 = const()[name = string("const_84_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_53_cast_fp16 = pad(constant_val = const_84_to_fp16, mode = matrix_bd_53_mode_0, pad = matrix_bd_53_pad_0, x = input_245_cast_fp16)[name = string("matrix_bd_53_cast_fp16")];
+            tensor<int32, [4]> var_2638 = const()[name = string("op_2638"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_55_cast_fp16 = reshape(shape = var_2638, x = matrix_bd_53_cast_fp16)[name = string("matrix_bd_55_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_57_begin_0 = const()[name = string("matrix_bd_57_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_57_end_0 = const()[name = string("matrix_bd_57_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_57_end_mask_0 = const()[name = string("matrix_bd_57_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_57_cast_fp16 = slice_by_index(begin = matrix_bd_57_begin_0, end = matrix_bd_57_end_0, end_mask = matrix_bd_57_end_mask_0, x = matrix_bd_55_cast_fp16)[name = string("matrix_bd_57_cast_fp16")];
+            tensor<int32, [5]> var_2644 = const()[name = string("op_2644"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_59_cast_fp16 = reshape(shape = var_2644, x = matrix_bd_57_cast_fp16)[name = string("matrix_bd_59_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_31_cast_fp16 = add(x = matrix_ac_11_cast_fp16, y = matrix_bd_59_cast_fp16)[name = string("attn_31_cast_fp16")];
+            fp16 _inversed_2647_y_0_to_fp16 = const()[name = string("_inversed_2647_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_2647_cast_fp16 = mul(x = attn_31_cast_fp16, y = _inversed_2647_y_0_to_fp16)[name = string("_inversed_2647_cast_fp16")];
+            string _inversed_2647_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_2647_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_2647_cast_fp16_to_fp32 = cast(dtype = _inversed_2647_cast_fp16_to_fp32_dtype_0, x = _inversed_2647_cast_fp16)[name = string("cast_300")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_2648 = tanh(x = _inversed_2647_cast_fp16_to_fp32)[name = string("op_2648")];
+            string var_2648_to_fp16_dtype_0 = const()[name = string("op_2648_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_5_softcap_to_fp16 = const()[name = string("self_attns_5_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_2648_to_fp16 = cast(dtype = var_2648_to_fp16_dtype_0, x = var_2648)[name = string("cast_299")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_33_cast_fp16 = mul(x = var_2648_to_fp16, y = self_attns_5_softcap_to_fp16)[name = string("attn_33_cast_fp16")];
+            string attn_33_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_33_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_33_cast_fp16_to_fp32 = cast(dtype = attn_33_cast_fp16_to_fp32_dtype_0, x = attn_33_cast_fp16)[name = string("cast_298")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_247 = select(a = var_2511, b = attn_33_cast_fp16_to_fp32, cond = var_460)[name = string("input_247")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_2652 = softmax(axis = var_2510, x = input_247)[name = string("op_2652")];
+            tensor<int32, [5]> var_2654 = const()[name = string("op_2654"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_31_transpose_x_0 = const()[name = string("out_31_transpose_x_0"), val = bool(false)];
+            bool out_31_transpose_y_0 = const()[name = string("out_31_transpose_y_0"), val = bool(false)];
+            string var_2652_to_fp16_dtype_0 = const()[name = string("op_2652_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_11_cast_fp16 = transpose(perm = var_2654, x = v_blocks_11_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_2652_to_fp16 = cast(dtype = var_2652_to_fp16_dtype_0, x = var_2652)[name = string("cast_297")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_31_cast_fp16 = matmul(transpose_x = out_31_transpose_x_0, transpose_y = out_31_transpose_y_0, x = var_2652_to_fp16, y = values_t_11_cast_fp16)[name = string("out_31_cast_fp16")];
+            tensor<int32, [5]> var_2657 = const()[name = string("op_2657"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_2659 = const()[name = string("op_2659"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_2658_cast_fp16 = transpose(perm = var_2657, x = out_31_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 60, 1024]> out_33_cast_fp16 = reshape(shape = var_2659, x = var_2658_cast_fp16)[name = string("out_33_cast_fp16")];
+            tensor<int32, [3]> var_2662_begin_0 = const()[name = string("op_2662_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_2662_end_0 = const()[name = string("op_2662_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_2662_end_mask_0 = const()[name = string("op_2662_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_2662_cast_fp16 = slice_by_index(begin = var_2662_begin_0, end = var_2662_end_0, end_mask = var_2662_end_mask_0, x = out_33_cast_fp16)[name = string("op_2662_cast_fp16")];
+            fp16 self_attns_5_post_input_min_to_fp16 = const()[name = string("self_attns_5_post_input_min_to_fp16"), val = fp16(-0x1.f6p+3)];
+            fp16 self_attns_5_post_input_max_to_fp16 = const()[name = string("self_attns_5_post_input_max_to_fp16"), val = fp16(0x1.f2p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_153_cast_fp16 = clip(alpha = self_attns_5_post_input_min_to_fp16, beta = self_attns_5_post_input_max_to_fp16, x = var_2662_cast_fp16)[name = string("clip_153_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_5_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67006784))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67531136))))[name = string("self_attns_5_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_62_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_5_post_linear_weight_to_fp16_palettized, x = clip_153_cast_fp16)[name = string("linear_62_cast_fp16")];
+            fp16 self_attns_5_post_output_min_to_fp16 = const()[name = string("self_attns_5_post_output_min_to_fp16"), val = fp16(-0x1.96p+5)];
+            fp16 self_attns_5_post_output_max_to_fp16 = const()[name = string("self_attns_5_post_output_max_to_fp16"), val = fp16(0x1.94p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_154_cast_fp16 = clip(alpha = self_attns_5_post_output_min_to_fp16, beta = self_attns_5_post_output_max_to_fp16, x = linear_62_cast_fp16)[name = string("clip_154_cast_fp16")];
+            fp16 var_2674_to_fp16 = const()[name = string("op_2674_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2675_to_fp16 = const()[name = string("op_2675_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_155_cast_fp16 = clip(alpha = var_2674_to_fp16, beta = var_2675_to_fp16, x = clip_154_cast_fp16)[name = string("clip_155_cast_fp16")];
+            string clip_155_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_155_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2677 = const()[name = string("op_2677"), val = fp32(-0x1p-1)];
+            fp32 var_2681_promoted = const()[name = string("op_2681_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_155_cast_fp16_to_fp32 = cast(dtype = clip_155_cast_fp16_to_fp32_dtype_0, x = clip_155_cast_fp16)[name = string("cast_296")];
+            tensor<fp32, [1, 50, 1024]> var_2687 = pow(x = clip_155_cast_fp16_to_fp32, y = var_2681_promoted)[name = string("op_2687")];
+            tensor<int32, [1]> var_2689_axes_0 = const()[name = string("op_2689_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2689_keep_dims_0 = const()[name = string("op_2689_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2689 = reduce_mean(axes = var_2689_axes_0, keep_dims = var_2689_keep_dims_0, x = var_2687)[name = string("op_2689")];
+            string var_2689_to_fp16_dtype_0 = const()[name = string("op_2689_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2690_to_fp16 = const()[name = string("op_2690_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2689_to_fp16 = cast(dtype = var_2689_to_fp16_dtype_0, x = var_2689)[name = string("cast_295")];
+            tensor<fp16, [1, 50, 1]> mean_squared_97_cast_fp16 = add(x = var_2689_to_fp16, y = var_2690_to_fp16)[name = string("mean_squared_97_cast_fp16")];
+            string mean_squared_97_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_97_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_97_cast_fp16_to_fp32 = cast(dtype = mean_squared_97_cast_fp16_to_fp32_dtype_0, x = mean_squared_97_cast_fp16)[name = string("cast_294")];
+            tensor<fp32, [1, 50, 1]> var_2692 = pow(x = mean_squared_97_cast_fp16_to_fp32, y = var_2677)[name = string("op_2692")];
+            string var_2692_to_fp16_dtype_0 = const()[name = string("op_2692_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2692_to_fp16 = cast(dtype = var_2692_to_fp16_dtype_0, x = var_2692)[name = string("cast_293")];
+            tensor<fp16, [1, 50, 1024]> normed_output_193_cast_fp16 = mul(x = clip_155_cast_fp16, y = var_2692_to_fp16)[name = string("normed_output_193_cast_fp16")];
+            tensor<fp16, [1024]> const_85_to_fp16 = const()[name = string("const_85_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67532224)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_195_cast_fp16 = mul(x = normed_output_193_cast_fp16, y = const_85_to_fp16)[name = string("normed_output_195_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_571_cast_fp16 = add(x = normed_output_195_cast_fp16, y = hidden_states_545_cast_fp16)[name = string("hidden_states_571_cast_fp16")];
+            string hidden_states_571_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_571_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2699 = const()[name = string("op_2699"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_2700 = const()[name = string("op_2700"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_2712 = const()[name = string("op_2712"), val = fp32(-0x1p-1)];
+            fp32 var_2708_promoted = const()[name = string("op_2708_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_571_cast_fp16_to_fp32 = cast(dtype = hidden_states_571_cast_fp16_to_fp32_dtype_0, x = hidden_states_571_cast_fp16)[name = string("cast_292")];
+            tensor<fp32, [1, 50, 1024]> var_2720 = pow(x = hidden_states_571_cast_fp16_to_fp32, y = var_2708_promoted)[name = string("op_2720")];
+            tensor<int32, [1]> var_2722_axes_0 = const()[name = string("op_2722_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2722_keep_dims_0 = const()[name = string("op_2722_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2722 = reduce_mean(axes = var_2722_axes_0, keep_dims = var_2722_keep_dims_0, x = var_2720)[name = string("op_2722")];
+            string var_2722_to_fp16_dtype_0 = const()[name = string("op_2722_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2723_to_fp16 = const()[name = string("op_2723_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2722_to_fp16 = cast(dtype = var_2722_to_fp16_dtype_0, x = var_2722)[name = string("cast_291")];
+            tensor<fp16, [1, 50, 1]> mean_squared_99_cast_fp16 = add(x = var_2722_to_fp16, y = var_2723_to_fp16)[name = string("mean_squared_99_cast_fp16")];
+            string mean_squared_99_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_99_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_99_cast_fp16_to_fp32 = cast(dtype = mean_squared_99_cast_fp16_to_fp32_dtype_0, x = mean_squared_99_cast_fp16)[name = string("cast_290")];
+            tensor<fp32, [1, 50, 1]> var_2725 = pow(x = mean_squared_99_cast_fp16_to_fp32, y = var_2712)[name = string("op_2725")];
+            string var_2725_to_fp16_dtype_0 = const()[name = string("op_2725_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2725_to_fp16 = cast(dtype = var_2725_to_fp16_dtype_0, x = var_2725)[name = string("cast_289")];
+            tensor<fp16, [1, 50, 1024]> normed_output_197_cast_fp16 = mul(x = hidden_states_571_cast_fp16, y = var_2725_to_fp16)[name = string("normed_output_197_cast_fp16")];
+            tensor<fp16, [1024]> const_86_to_fp16 = const()[name = string("const_86_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67534336)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_199_cast_fp16 = mul(x = normed_output_197_cast_fp16, y = const_86_to_fp16)[name = string("normed_output_199_cast_fp16")];
+            fp16 lconv1ds_5_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_5_linear_start_input_min_to_fp16"), val = fp16(-0x1.7p+3)];
+            fp16 lconv1ds_5_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_5_linear_start_input_max_to_fp16"), val = fp16(0x1.6ep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_156_cast_fp16 = clip(alpha = lconv1ds_5_linear_start_input_min_to_fp16, beta = lconv1ds_5_linear_start_input_max_to_fp16, x = normed_output_199_cast_fp16)[name = string("clip_156_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_5_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(67536448))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68585088))))[name = string("lconv1ds_5_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_63_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_5_linear_start_linear_weight_to_fp16_palettized, x = clip_156_cast_fp16)[name = string("linear_63_cast_fp16")];
+            fp16 lconv1ds_5_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_5_linear_start_output_min_to_fp16"), val = fp16(-0x1.92p+4)];
+            fp16 lconv1ds_5_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_5_linear_start_output_max_to_fp16"), val = fp16(0x1.8ep+4)];
+            tensor<fp16, [1, 50, 2048]> clip_157_cast_fp16 = clip(alpha = lconv1ds_5_linear_start_output_min_to_fp16, beta = lconv1ds_5_linear_start_output_max_to_fp16, x = linear_63_cast_fp16)[name = string("clip_157_cast_fp16")];
+            int32 hidden_states_579_split_num_splits_0 = const()[name = string("hidden_states_579_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_579_split_axis_0 = const()[name = string("hidden_states_579_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_579_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_579_split_cast_fp16_1 = split(axis = hidden_states_579_split_axis_0, num_splits = hidden_states_579_split_num_splits_0, x = clip_157_cast_fp16)[name = string("hidden_states_579_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_579_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_579_split_cast_fp16_1)[name = string("hidden_states_579_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_579_cast_fp16 = mul(x = hidden_states_579_split_cast_fp16_0, y = hidden_states_579_split_1_sigmoid_cast_fp16)[name = string("hidden_states_579_cast_fp16")];
+            tensor<int32, [3]> input_255_perm_0 = const()[name = string("input_255_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_257_pad_0 = const()[name = string("input_257_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_257_mode_0 = const()[name = string("input_257_mode_0"), val = string("constant")];
+            fp16 const_87_to_fp16 = const()[name = string("const_87_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_255_cast_fp16 = transpose(perm = input_255_perm_0, x = hidden_states_579_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 1024, 54]> input_257_cast_fp16 = pad(constant_val = const_87_to_fp16, mode = input_257_mode_0, pad = input_257_pad_0, x = input_255_cast_fp16)[name = string("input_257_cast_fp16")];
+            string var_2751_pad_type_0 = const()[name = string("op_2751_pad_type_0"), val = string("valid")];
+            int32 var_2751_groups_0 = const()[name = string("op_2751_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_2751_strides_0 = const()[name = string("op_2751_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2751_pad_0 = const()[name = string("op_2751_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2751_dilations_0 = const()[name = string("op_2751_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_5_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68587200))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68589824))))[name = string("lconv1ds_5_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_2751_cast_fp16 = conv(dilations = var_2751_dilations_0, groups = var_2751_groups_0, pad = var_2751_pad_0, pad_type = var_2751_pad_type_0, strides = var_2751_strides_0, weight = lconv1ds_5_depthwise_conv1d_weight_to_fp16_palettized, x = input_257_cast_fp16)[name = string("op_2751_cast_fp16")];
+            tensor<int32, [3]> hidden_states_581_perm_0 = const()[name = string("hidden_states_581_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_581_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_581_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_581_cast_fp16 = transpose(perm = hidden_states_581_perm_0, x = var_2751_cast_fp16)[name = string("transpose_36")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_581_cast_fp16_to_fp32 = cast(dtype = hidden_states_581_cast_fp16_to_fp32_dtype_0, x = hidden_states_581_cast_fp16)[name = string("cast_288")];
+            tensor<fp32, [1, 50, 1024]> clip_158 = clip(alpha = var_2700, beta = var_2699, x = hidden_states_581_cast_fp16_to_fp32)[name = string("clip_158")];
+            fp32 var_2708_promoted_1 = const()[name = string("op_2708_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2756 = pow(x = clip_158, y = var_2708_promoted_1)[name = string("op_2756")];
+            tensor<int32, [1]> var_2758_axes_0 = const()[name = string("op_2758_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2758_keep_dims_0 = const()[name = string("op_2758_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2758 = reduce_mean(axes = var_2758_axes_0, keep_dims = var_2758_keep_dims_0, x = var_2756)[name = string("op_2758")];
+            string var_2758_to_fp16_dtype_0 = const()[name = string("op_2758_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2759_to_fp16 = const()[name = string("op_2759_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2758_to_fp16 = cast(dtype = var_2758_to_fp16_dtype_0, x = var_2758)[name = string("cast_287")];
+            tensor<fp16, [1, 50, 1]> mean_squared_101_cast_fp16 = add(x = var_2758_to_fp16, y = var_2759_to_fp16)[name = string("mean_squared_101_cast_fp16")];
+            string mean_squared_101_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_101_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_101_cast_fp16_to_fp32 = cast(dtype = mean_squared_101_cast_fp16_to_fp32_dtype_0, x = mean_squared_101_cast_fp16)[name = string("cast_286")];
+            tensor<fp32, [1, 50, 1]> var_2761 = pow(x = mean_squared_101_cast_fp16_to_fp32, y = var_2712)[name = string("op_2761")];
+            string clip_158_to_fp16_dtype_0 = const()[name = string("clip_158_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2761_to_fp16_dtype_0 = const()[name = string("op_2761_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_158_to_fp16 = cast(dtype = clip_158_to_fp16_dtype_0, x = clip_158)[name = string("cast_284")];
+            tensor<fp16, [1, 50, 1]> var_2761_to_fp16 = cast(dtype = var_2761_to_fp16_dtype_0, x = var_2761)[name = string("cast_285")];
+            tensor<fp16, [1, 50, 1024]> normed_output_201_cast_fp16 = mul(x = clip_158_to_fp16, y = var_2761_to_fp16)[name = string("normed_output_201_cast_fp16")];
+            tensor<fp16, [1024]> const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68590912)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_203_cast_fp16 = mul(x = normed_output_201_cast_fp16, y = const_88_to_fp16)[name = string("normed_output_203_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_587_cast_fp16 = silu(x = normed_output_203_cast_fp16)[name = string("hidden_states_587_cast_fp16")];
+            fp16 lconv1ds_5_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_5_linear_end_input_min_to_fp16"), val = fp16(-0x1.bep+3)];
+            fp16 lconv1ds_5_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_5_linear_end_input_max_to_fp16"), val = fp16(0x1.bap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_159_cast_fp16 = clip(alpha = lconv1ds_5_linear_end_input_min_to_fp16, beta = lconv1ds_5_linear_end_input_max_to_fp16, x = hidden_states_587_cast_fp16)[name = string("clip_159_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_5_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(68593024))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69117376))))[name = string("lconv1ds_5_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_64_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_5_linear_end_linear_weight_to_fp16_palettized, x = clip_159_cast_fp16)[name = string("linear_64_cast_fp16")];
+            fp16 lconv1ds_5_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_5_linear_end_output_min_to_fp16"), val = fp16(-0x1.02p+3)];
+            fp16 lconv1ds_5_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_5_linear_end_output_max_to_fp16"), val = fp16(0x1p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_160_cast_fp16 = clip(alpha = lconv1ds_5_linear_end_output_min_to_fp16, beta = lconv1ds_5_linear_end_output_max_to_fp16, x = linear_64_cast_fp16)[name = string("clip_160_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_593_cast_fp16 = add(x = clip_160_cast_fp16, y = hidden_states_571_cast_fp16)[name = string("hidden_states_593_cast_fp16")];
+            string hidden_states_593_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_593_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2785 = const()[name = string("op_2785"), val = fp32(-0x1p-1)];
+            fp32 var_2786 = const()[name = string("op_2786"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_2787 = const()[name = string("op_2787"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_593_cast_fp16_to_fp32 = cast(dtype = hidden_states_593_cast_fp16_to_fp32_dtype_0, x = hidden_states_593_cast_fp16)[name = string("cast_283")];
+            tensor<fp32, [1, 50, 1024]> clip_161 = clip(alpha = var_2787, beta = var_2786, x = hidden_states_593_cast_fp16_to_fp32)[name = string("clip_161")];
+            fp32 var_2781_promoted = const()[name = string("op_2781_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2795 = pow(x = clip_161, y = var_2781_promoted)[name = string("op_2795")];
+            tensor<int32, [1]> var_2797_axes_0 = const()[name = string("op_2797_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2797_keep_dims_0 = const()[name = string("op_2797_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2797 = reduce_mean(axes = var_2797_axes_0, keep_dims = var_2797_keep_dims_0, x = var_2795)[name = string("op_2797")];
+            string var_2797_to_fp16_dtype_0 = const()[name = string("op_2797_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2798_to_fp16 = const()[name = string("op_2798_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2797_to_fp16 = cast(dtype = var_2797_to_fp16_dtype_0, x = var_2797)[name = string("cast_282")];
+            tensor<fp16, [1, 50, 1]> mean_squared_103_cast_fp16 = add(x = var_2797_to_fp16, y = var_2798_to_fp16)[name = string("mean_squared_103_cast_fp16")];
+            string mean_squared_103_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_103_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_103_cast_fp16_to_fp32 = cast(dtype = mean_squared_103_cast_fp16_to_fp32_dtype_0, x = mean_squared_103_cast_fp16)[name = string("cast_281")];
+            tensor<fp32, [1, 50, 1]> var_2800 = pow(x = mean_squared_103_cast_fp16_to_fp32, y = var_2785)[name = string("op_2800")];
+            string clip_161_to_fp16_dtype_0 = const()[name = string("clip_161_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2800_to_fp16_dtype_0 = const()[name = string("op_2800_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_161_to_fp16 = cast(dtype = clip_161_to_fp16_dtype_0, x = clip_161)[name = string("cast_279")];
+            tensor<fp16, [1, 50, 1]> var_2800_to_fp16 = cast(dtype = var_2800_to_fp16_dtype_0, x = var_2800)[name = string("cast_280")];
+            tensor<fp16, [1, 50, 1024]> normed_output_205_cast_fp16 = mul(x = clip_161_to_fp16, y = var_2800_to_fp16)[name = string("normed_output_205_cast_fp16")];
+            tensor<fp16, [1024]> const_89_to_fp16 = const()[name = string("const_89_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69118464)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_207_cast_fp16 = mul(x = normed_output_205_cast_fp16, y = const_89_to_fp16)[name = string("normed_output_207_cast_fp16")];
+            fp16 feed_forward2s_5_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.9ep+3)];
+            fp16 feed_forward2s_5_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.9cp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_162_cast_fp16 = clip(alpha = feed_forward2s_5_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_5_ffw_layer_1_input_max_to_fp16, x = normed_output_207_cast_fp16)[name = string("clip_162_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_5_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69120576))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71217792))))[name = string("feed_forward2s_5_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_65_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_5_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_162_cast_fp16)[name = string("linear_65_cast_fp16")];
+            fp16 feed_forward2s_5_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.f4p+4)];
+            fp16 feed_forward2s_5_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.fp+4)];
+            tensor<fp16, [1, 50, 4096]> clip_163_cast_fp16 = clip(alpha = feed_forward2s_5_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_5_ffw_layer_1_output_max_to_fp16, x = linear_65_cast_fp16)[name = string("clip_163_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_603_cast_fp16 = silu(x = clip_163_cast_fp16)[name = string("hidden_states_603_cast_fp16")];
+            fp16 feed_forward2s_5_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.1p+3)];
+            fp16 feed_forward2s_5_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.0ep+3)];
+            tensor<fp16, [1, 50, 4096]> clip_164_cast_fp16 = clip(alpha = feed_forward2s_5_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_5_ffw_layer_2_input_max_to_fp16, x = hidden_states_603_cast_fp16)[name = string("clip_164_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_5_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71221952))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73319168))))[name = string("feed_forward2s_5_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_66_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_5_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_164_cast_fp16)[name = string("linear_66_cast_fp16")];
+            fp16 feed_forward2s_5_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.2p+5)];
+            fp16 feed_forward2s_5_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_5_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.1ep+5)];
+            tensor<fp16, [1, 50, 1024]> clip_165_cast_fp16 = clip(alpha = feed_forward2s_5_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_5_ffw_layer_2_output_max_to_fp16, x = linear_66_cast_fp16)[name = string("clip_165_cast_fp16")];
+            string clip_165_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_165_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_165_cast_fp16_to_fp32 = cast(dtype = clip_165_cast_fp16_to_fp32_dtype_0, x = clip_165_cast_fp16)[name = string("cast_278")];
+            tensor<fp32, [1, 50, 1024]> clip_166 = clip(alpha = var_2787, beta = var_2786, x = clip_165_cast_fp16_to_fp32)[name = string("clip_166")];
+            fp32 var_2781_promoted_1 = const()[name = string("op_2781_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2827 = pow(x = clip_166, y = var_2781_promoted_1)[name = string("op_2827")];
+            tensor<int32, [1]> var_2829_axes_0 = const()[name = string("op_2829_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2829_keep_dims_0 = const()[name = string("op_2829_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2829 = reduce_mean(axes = var_2829_axes_0, keep_dims = var_2829_keep_dims_0, x = var_2827)[name = string("op_2829")];
+            string var_2829_to_fp16_dtype_0 = const()[name = string("op_2829_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2830_to_fp16 = const()[name = string("op_2830_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2829_to_fp16 = cast(dtype = var_2829_to_fp16_dtype_0, x = var_2829)[name = string("cast_277")];
+            tensor<fp16, [1, 50, 1]> mean_squared_105_cast_fp16 = add(x = var_2829_to_fp16, y = var_2830_to_fp16)[name = string("mean_squared_105_cast_fp16")];
+            string mean_squared_105_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_105_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_105_cast_fp16_to_fp32 = cast(dtype = mean_squared_105_cast_fp16_to_fp32_dtype_0, x = mean_squared_105_cast_fp16)[name = string("cast_276")];
+            tensor<fp32, [1, 50, 1]> var_2832 = pow(x = mean_squared_105_cast_fp16_to_fp32, y = var_2785)[name = string("op_2832")];
+            string clip_166_to_fp16_dtype_0 = const()[name = string("clip_166_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2832_to_fp16_dtype_0 = const()[name = string("op_2832_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_166_to_fp16 = cast(dtype = clip_166_to_fp16_dtype_0, x = clip_166)[name = string("cast_274")];
+            tensor<fp16, [1, 50, 1]> var_2832_to_fp16 = cast(dtype = var_2832_to_fp16_dtype_0, x = var_2832)[name = string("cast_275")];
+            tensor<fp16, [1, 50, 1024]> normed_output_209_cast_fp16 = mul(x = clip_166_to_fp16, y = var_2832_to_fp16)[name = string("normed_output_209_cast_fp16")];
+            tensor<fp16, [1024]> const_90_to_fp16 = const()[name = string("const_90_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73320256)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_211_cast_fp16 = mul(x = normed_output_209_cast_fp16, y = const_90_to_fp16)[name = string("normed_output_211_cast_fp16")];
+            fp16 var_2777_to_fp16 = const()[name = string("op_2777_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_615_cast_fp16 = mul(x = normed_output_211_cast_fp16, y = var_2777_to_fp16)[name = string("hidden_states_615_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_617_cast_fp16 = add(x = hidden_states_615_cast_fp16, y = hidden_states_593_cast_fp16)[name = string("hidden_states_617_cast_fp16")];
+            fp16 var_2839_to_fp16 = const()[name = string("op_2839_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2840_to_fp16 = const()[name = string("op_2840_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_167_cast_fp16 = clip(alpha = var_2839_to_fp16, beta = var_2840_to_fp16, x = hidden_states_617_cast_fp16)[name = string("clip_167_cast_fp16")];
+            string clip_167_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_167_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2842 = const()[name = string("op_2842"), val = fp32(-0x1p-1)];
+            fp32 var_2846_promoted = const()[name = string("op_2846_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_167_cast_fp16_to_fp32 = cast(dtype = clip_167_cast_fp16_to_fp32_dtype_0, x = clip_167_cast_fp16)[name = string("cast_273")];
+            tensor<fp32, [1, 50, 1024]> var_2852 = pow(x = clip_167_cast_fp16_to_fp32, y = var_2846_promoted)[name = string("op_2852")];
+            tensor<int32, [1]> var_2854_axes_0 = const()[name = string("op_2854_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2854_keep_dims_0 = const()[name = string("op_2854_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2854 = reduce_mean(axes = var_2854_axes_0, keep_dims = var_2854_keep_dims_0, x = var_2852)[name = string("op_2854")];
+            string var_2854_to_fp16_dtype_0 = const()[name = string("op_2854_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2855_to_fp16 = const()[name = string("op_2855_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2854_to_fp16 = cast(dtype = var_2854_to_fp16_dtype_0, x = var_2854)[name = string("cast_272")];
+            tensor<fp16, [1, 50, 1]> mean_squared_107_cast_fp16 = add(x = var_2854_to_fp16, y = var_2855_to_fp16)[name = string("mean_squared_107_cast_fp16")];
+            string mean_squared_107_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_107_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_107_cast_fp16_to_fp32 = cast(dtype = mean_squared_107_cast_fp16_to_fp32_dtype_0, x = mean_squared_107_cast_fp16)[name = string("cast_271")];
+            tensor<fp32, [1, 50, 1]> var_2857 = pow(x = mean_squared_107_cast_fp16_to_fp32, y = var_2842)[name = string("op_2857")];
+            string var_2857_to_fp16_dtype_0 = const()[name = string("op_2857_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2857_to_fp16 = cast(dtype = var_2857_to_fp16_dtype_0, x = var_2857)[name = string("cast_270")];
+            tensor<fp16, [1, 50, 1024]> normed_output_213_cast_fp16 = mul(x = clip_167_cast_fp16, y = var_2857_to_fp16)[name = string("normed_output_213_cast_fp16")];
+            tensor<fp16, [1024]> const_91_to_fp16 = const()[name = string("const_91_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73322368)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_215_cast_fp16 = mul(x = normed_output_213_cast_fp16, y = const_91_to_fp16)[name = string("normed_output_215_cast_fp16")];
+            string normed_output_215_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_215_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2870 = const()[name = string("op_2870"), val = fp32(-0x1p-1)];
+            fp32 var_2871 = const()[name = string("op_2871"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_2872 = const()[name = string("op_2872"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_215_cast_fp16_to_fp32 = cast(dtype = normed_output_215_cast_fp16_to_fp32_dtype_0, x = normed_output_215_cast_fp16)[name = string("cast_269")];
+            tensor<fp32, [1, 50, 1024]> clip_168 = clip(alpha = var_2872, beta = var_2871, x = normed_output_215_cast_fp16_to_fp32)[name = string("clip_168")];
+            fp32 var_2866_promoted = const()[name = string("op_2866_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2880 = pow(x = clip_168, y = var_2866_promoted)[name = string("op_2880")];
+            tensor<int32, [1]> var_2882_axes_0 = const()[name = string("op_2882_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2882_keep_dims_0 = const()[name = string("op_2882_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2882 = reduce_mean(axes = var_2882_axes_0, keep_dims = var_2882_keep_dims_0, x = var_2880)[name = string("op_2882")];
+            string var_2882_to_fp16_dtype_0 = const()[name = string("op_2882_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2883_to_fp16 = const()[name = string("op_2883_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2882_to_fp16 = cast(dtype = var_2882_to_fp16_dtype_0, x = var_2882)[name = string("cast_268")];
+            tensor<fp16, [1, 50, 1]> mean_squared_109_cast_fp16 = add(x = var_2882_to_fp16, y = var_2883_to_fp16)[name = string("mean_squared_109_cast_fp16")];
+            string mean_squared_109_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_109_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_109_cast_fp16_to_fp32 = cast(dtype = mean_squared_109_cast_fp16_to_fp32_dtype_0, x = mean_squared_109_cast_fp16)[name = string("cast_267")];
+            tensor<fp32, [1, 50, 1]> var_2885 = pow(x = mean_squared_109_cast_fp16_to_fp32, y = var_2870)[name = string("op_2885")];
+            string clip_168_to_fp16_dtype_0 = const()[name = string("clip_168_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2885_to_fp16_dtype_0 = const()[name = string("op_2885_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_168_to_fp16 = cast(dtype = clip_168_to_fp16_dtype_0, x = clip_168)[name = string("cast_265")];
+            tensor<fp16, [1, 50, 1]> var_2885_to_fp16 = cast(dtype = var_2885_to_fp16_dtype_0, x = var_2885)[name = string("cast_266")];
+            tensor<fp16, [1, 50, 1024]> normed_output_217_cast_fp16 = mul(x = clip_168_to_fp16, y = var_2885_to_fp16)[name = string("normed_output_217_cast_fp16")];
+            tensor<fp16, [1024]> const_92_to_fp16 = const()[name = string("const_92_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73324480)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_219_cast_fp16 = mul(x = normed_output_217_cast_fp16, y = const_92_to_fp16)[name = string("normed_output_219_cast_fp16")];
+            fp16 feed_forward1s_6_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.84p+3)];
+            fp16 feed_forward1s_6_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.82p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_169_cast_fp16 = clip(alpha = feed_forward1s_6_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_6_ffw_layer_1_input_max_to_fp16, x = normed_output_219_cast_fp16)[name = string("clip_169_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_6_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73326592))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75423808))))[name = string("feed_forward1s_6_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_67_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_6_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_169_cast_fp16)[name = string("linear_67_cast_fp16")];
+            fp16 feed_forward1s_6_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.9cp+4)];
+            fp16 feed_forward1s_6_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.98p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_170_cast_fp16 = clip(alpha = feed_forward1s_6_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_6_ffw_layer_1_output_max_to_fp16, x = linear_67_cast_fp16)[name = string("clip_170_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_633_cast_fp16 = silu(x = clip_170_cast_fp16)[name = string("hidden_states_633_cast_fp16")];
+            fp16 feed_forward1s_6_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1p+3)];
+            fp16 feed_forward1s_6_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.fcp+2)];
+            tensor<fp16, [1, 50, 4096]> clip_171_cast_fp16 = clip(alpha = feed_forward1s_6_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_6_ffw_layer_2_input_max_to_fp16, x = hidden_states_633_cast_fp16)[name = string("clip_171_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_6_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75427968))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77525184))))[name = string("feed_forward1s_6_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_68_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_6_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_171_cast_fp16)[name = string("linear_68_cast_fp16")];
+            fp16 feed_forward1s_6_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.d8p+4)];
+            fp16 feed_forward1s_6_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_6_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.d4p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_172_cast_fp16 = clip(alpha = feed_forward1s_6_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_6_ffw_layer_2_output_max_to_fp16, x = linear_68_cast_fp16)[name = string("clip_172_cast_fp16")];
+            string clip_172_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_172_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_172_cast_fp16_to_fp32 = cast(dtype = clip_172_cast_fp16_to_fp32_dtype_0, x = clip_172_cast_fp16)[name = string("cast_264")];
+            tensor<fp32, [1, 50, 1024]> clip_173 = clip(alpha = var_2872, beta = var_2871, x = clip_172_cast_fp16_to_fp32)[name = string("clip_173")];
+            fp32 var_2866_promoted_1 = const()[name = string("op_2866_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_2912 = pow(x = clip_173, y = var_2866_promoted_1)[name = string("op_2912")];
+            tensor<int32, [1]> var_2914_axes_0 = const()[name = string("op_2914_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2914_keep_dims_0 = const()[name = string("op_2914_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2914 = reduce_mean(axes = var_2914_axes_0, keep_dims = var_2914_keep_dims_0, x = var_2912)[name = string("op_2914")];
+            string var_2914_to_fp16_dtype_0 = const()[name = string("op_2914_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2915_to_fp16 = const()[name = string("op_2915_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2914_to_fp16 = cast(dtype = var_2914_to_fp16_dtype_0, x = var_2914)[name = string("cast_263")];
+            tensor<fp16, [1, 50, 1]> mean_squared_111_cast_fp16 = add(x = var_2914_to_fp16, y = var_2915_to_fp16)[name = string("mean_squared_111_cast_fp16")];
+            string mean_squared_111_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_111_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_111_cast_fp16_to_fp32 = cast(dtype = mean_squared_111_cast_fp16_to_fp32_dtype_0, x = mean_squared_111_cast_fp16)[name = string("cast_262")];
+            tensor<fp32, [1, 50, 1]> var_2917 = pow(x = mean_squared_111_cast_fp16_to_fp32, y = var_2870)[name = string("op_2917")];
+            string clip_173_to_fp16_dtype_0 = const()[name = string("clip_173_to_fp16_dtype_0"), val = string("fp16")];
+            string var_2917_to_fp16_dtype_0 = const()[name = string("op_2917_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_173_to_fp16 = cast(dtype = clip_173_to_fp16_dtype_0, x = clip_173)[name = string("cast_260")];
+            tensor<fp16, [1, 50, 1]> var_2917_to_fp16 = cast(dtype = var_2917_to_fp16_dtype_0, x = var_2917)[name = string("cast_261")];
+            tensor<fp16, [1, 50, 1024]> normed_output_221_cast_fp16 = mul(x = clip_173_to_fp16, y = var_2917_to_fp16)[name = string("normed_output_221_cast_fp16")];
+            tensor<fp16, [1024]> const_93_to_fp16 = const()[name = string("const_93_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77526272)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_223_cast_fp16 = mul(x = normed_output_221_cast_fp16, y = const_93_to_fp16)[name = string("normed_output_223_cast_fp16")];
+            fp16 var_2862_to_fp16 = const()[name = string("op_2862_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_645_cast_fp16 = mul(x = normed_output_223_cast_fp16, y = var_2862_to_fp16)[name = string("hidden_states_645_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_647_cast_fp16 = add(x = hidden_states_645_cast_fp16, y = normed_output_215_cast_fp16)[name = string("hidden_states_647_cast_fp16")];
+            fp16 var_2924_to_fp16 = const()[name = string("op_2924_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_2925_to_fp16 = const()[name = string("op_2925_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_174_cast_fp16 = clip(alpha = var_2924_to_fp16, beta = var_2925_to_fp16, x = hidden_states_647_cast_fp16)[name = string("clip_174_cast_fp16")];
+            string clip_174_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_174_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_2927 = const()[name = string("op_2927"), val = fp32(-0x1p-1)];
+            fp32 var_2931_promoted = const()[name = string("op_2931_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_174_cast_fp16_to_fp32 = cast(dtype = clip_174_cast_fp16_to_fp32_dtype_0, x = clip_174_cast_fp16)[name = string("cast_259")];
+            tensor<fp32, [1, 50, 1024]> var_2937 = pow(x = clip_174_cast_fp16_to_fp32, y = var_2931_promoted)[name = string("op_2937")];
+            tensor<int32, [1]> var_2939_axes_0 = const()[name = string("op_2939_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2939_keep_dims_0 = const()[name = string("op_2939_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_2939 = reduce_mean(axes = var_2939_axes_0, keep_dims = var_2939_keep_dims_0, x = var_2937)[name = string("op_2939")];
+            string var_2939_to_fp16_dtype_0 = const()[name = string("op_2939_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_2940_to_fp16 = const()[name = string("op_2940_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_2939_to_fp16 = cast(dtype = var_2939_to_fp16_dtype_0, x = var_2939)[name = string("cast_258")];
+            tensor<fp16, [1, 50, 1]> mean_squared_113_cast_fp16 = add(x = var_2939_to_fp16, y = var_2940_to_fp16)[name = string("mean_squared_113_cast_fp16")];
+            string mean_squared_113_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_113_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_113_cast_fp16_to_fp32 = cast(dtype = mean_squared_113_cast_fp16_to_fp32_dtype_0, x = mean_squared_113_cast_fp16)[name = string("cast_257")];
+            tensor<fp32, [1, 50, 1]> var_2942 = pow(x = mean_squared_113_cast_fp16_to_fp32, y = var_2927)[name = string("op_2942")];
+            string var_2942_to_fp16_dtype_0 = const()[name = string("op_2942_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_2942_to_fp16 = cast(dtype = var_2942_to_fp16_dtype_0, x = var_2942)[name = string("cast_256")];
+            tensor<fp16, [1, 50, 1024]> normed_output_225_cast_fp16 = mul(x = clip_174_cast_fp16, y = var_2942_to_fp16)[name = string("normed_output_225_cast_fp16")];
+            tensor<fp16, [1024]> const_94_to_fp16 = const()[name = string("const_94_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77528384)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_227_cast_fp16 = mul(x = normed_output_225_cast_fp16, y = const_94_to_fp16)[name = string("normed_output_227_cast_fp16")];
+            int32 var_2948 = const()[name = string("op_2948"), val = int32(-1)];
+            fp32 var_2949 = const()[name = string("op_2949"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_6_q_proj_input_min_to_fp16 = const()[name = string("self_attns_6_q_proj_input_min_to_fp16"), val = fp16(-0x1.4ap+3)];
+            fp16 self_attns_6_q_proj_input_max_to_fp16 = const()[name = string("self_attns_6_q_proj_input_max_to_fp16"), val = fp16(0x1.48p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_175_cast_fp16 = clip(alpha = self_attns_6_q_proj_input_min_to_fp16, beta = self_attns_6_q_proj_input_max_to_fp16, x = normed_output_227_cast_fp16)[name = string("clip_175_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_6_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77530496))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78054848))))[name = string("self_attns_6_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_69_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_6_q_proj_linear_weight_to_fp16_palettized, x = clip_175_cast_fp16)[name = string("linear_69_cast_fp16")];
+            fp16 self_attns_6_q_proj_output_min_to_fp16 = const()[name = string("self_attns_6_q_proj_output_min_to_fp16"), val = fp16(-0x1.fp+3)];
+            fp16 self_attns_6_q_proj_output_max_to_fp16 = const()[name = string("self_attns_6_q_proj_output_max_to_fp16"), val = fp16(0x1.ecp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_176_cast_fp16 = clip(alpha = self_attns_6_q_proj_output_min_to_fp16, beta = self_attns_6_q_proj_output_max_to_fp16, x = linear_69_cast_fp16)[name = string("clip_176_cast_fp16")];
+            tensor<int32, [4]> var_2993 = const()[name = string("op_2993"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_13_cast_fp16 = reshape(shape = var_2993, x = clip_176_cast_fp16)[name = string("q_13_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_6_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78055936))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78580288))))[name = string("self_attns_6_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_70_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_6_k_proj_linear_weight_to_fp16_palettized, x = clip_175_cast_fp16)[name = string("linear_70_cast_fp16")];
+            fp16 self_attns_6_k_proj_output_min_to_fp16 = const()[name = string("self_attns_6_k_proj_output_min_to_fp16"), val = fp16(-0x1.fp+3)];
+            fp16 self_attns_6_k_proj_output_max_to_fp16 = const()[name = string("self_attns_6_k_proj_output_max_to_fp16"), val = fp16(0x1.ecp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_178_cast_fp16 = clip(alpha = self_attns_6_k_proj_output_min_to_fp16, beta = self_attns_6_k_proj_output_max_to_fp16, x = linear_70_cast_fp16)[name = string("clip_178_cast_fp16")];
+            tensor<int32, [4]> var_3005 = const()[name = string("op_3005"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_13_cast_fp16 = reshape(shape = var_3005, x = clip_178_cast_fp16)[name = string("k_13_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_6_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78581376))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79105728))))[name = string("self_attns_6_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_71_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_6_v_proj_linear_weight_to_fp16_palettized, x = clip_175_cast_fp16)[name = string("linear_71_cast_fp16")];
+            fp16 self_attns_6_v_proj_output_min_to_fp16 = const()[name = string("self_attns_6_v_proj_output_min_to_fp16"), val = fp16(-0x1.fp+3)];
+            fp16 self_attns_6_v_proj_output_max_to_fp16 = const()[name = string("self_attns_6_v_proj_output_max_to_fp16"), val = fp16(0x1.ecp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_180_cast_fp16 = clip(alpha = self_attns_6_v_proj_output_min_to_fp16, beta = self_attns_6_v_proj_output_max_to_fp16, x = linear_71_cast_fp16)[name = string("clip_180_cast_fp16")];
+            tensor<int32, [4]> var_3017 = const()[name = string("op_3017"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_285_cast_fp16 = reshape(shape = var_3017, x = clip_180_cast_fp16)[name = string("input_285_cast_fp16")];
+            fp16 var_3019_to_fp16 = const()[name = string("op_3019_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_3020_cast_fp16 = mul(x = q_13_cast_fp16, y = var_3019_to_fp16)[name = string("op_3020_cast_fp16")];
+            tensor<fp16, [128]> var_3021_to_fp16 = const()[name = string("op_3021_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79106816)))];
+            tensor<fp16, [1, 50, 8, 128]> input_281_cast_fp16 = mul(x = var_3020_cast_fp16, y = var_3021_to_fp16)[name = string("input_281_cast_fp16")];
+            fp16 var_3023_to_fp16 = const()[name = string("op_3023_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_283_cast_fp16 = mul(x = k_13_cast_fp16, y = var_3023_to_fp16)[name = string("input_283_cast_fp16")];
+            tensor<int32, [8]> q_padded_13_pad_0 = const()[name = string("q_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_13_mode_0 = const()[name = string("q_padded_13_mode_0"), val = string("constant")];
+            fp16 const_95_to_fp16 = const()[name = string("const_95_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_13_cast_fp16 = pad(constant_val = const_95_to_fp16, mode = q_padded_13_mode_0, pad = q_padded_13_pad_0, x = input_281_cast_fp16)[name = string("q_padded_13_cast_fp16")];
+            tensor<int32, [5]> var_3027 = const()[name = string("op_3027"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_13_cast_fp16 = reshape(shape = var_3027, x = q_padded_13_cast_fp16)[name = string("q_blocks_13_cast_fp16")];
+            tensor<int32, [8]> k_padded_13_pad_0 = const()[name = string("k_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_13_mode_0 = const()[name = string("k_padded_13_mode_0"), val = string("constant")];
+            fp16 const_96_to_fp16 = const()[name = string("const_96_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_13_cast_fp16 = pad(constant_val = const_96_to_fp16, mode = k_padded_13_mode_0, pad = k_padded_13_pad_0, x = input_283_cast_fp16)[name = string("k_padded_13_cast_fp16")];
+            tensor<int32, [8]> v_padded_13_pad_0 = const()[name = string("v_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_13_mode_0 = const()[name = string("v_padded_13_mode_0"), val = string("constant")];
+            fp16 const_97_to_fp16 = const()[name = string("const_97_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_13_cast_fp16 = pad(constant_val = const_97_to_fp16, mode = v_padded_13_mode_0, pad = v_padded_13_pad_0, x = input_285_cast_fp16)[name = string("v_padded_13_cast_fp16")];
+            tensor<int32, [4]> var_3034_begin_0 = const()[name = string("op_3034_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3034_end_0 = const()[name = string("op_3034_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_3034_end_mask_0 = const()[name = string("op_3034_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3034_cast_fp16 = slice_by_index(begin = var_3034_begin_0, end = var_3034_end_0, end_mask = var_3034_end_mask_0, x = k_padded_13_cast_fp16)[name = string("op_3034_cast_fp16")];
+            tensor<int32, [4]> var_3036_begin_0 = const()[name = string("op_3036_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_3036_end_0 = const()[name = string("op_3036_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_3036_end_mask_0 = const()[name = string("op_3036_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3036_cast_fp16 = slice_by_index(begin = var_3036_begin_0, end = var_3036_end_0, end_mask = var_3036_end_mask_0, x = k_padded_13_cast_fp16)[name = string("op_3036_cast_fp16")];
+            tensor<int32, [4]> var_3038_begin_0 = const()[name = string("op_3038_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_3038_end_0 = const()[name = string("op_3038_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_3038_end_mask_0 = const()[name = string("op_3038_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3038_cast_fp16 = slice_by_index(begin = var_3038_begin_0, end = var_3038_end_0, end_mask = var_3038_end_mask_0, x = k_padded_13_cast_fp16)[name = string("op_3038_cast_fp16")];
+            tensor<int32, [4]> var_3040_begin_0 = const()[name = string("op_3040_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_3040_end_0 = const()[name = string("op_3040_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_3040_end_mask_0 = const()[name = string("op_3040_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3040_cast_fp16 = slice_by_index(begin = var_3040_begin_0, end = var_3040_end_0, end_mask = var_3040_end_mask_0, x = k_padded_13_cast_fp16)[name = string("op_3040_cast_fp16")];
+            tensor<int32, [4]> var_3042_begin_0 = const()[name = string("op_3042_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_3042_end_0 = const()[name = string("op_3042_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_3042_end_mask_0 = const()[name = string("op_3042_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3042_cast_fp16 = slice_by_index(begin = var_3042_begin_0, end = var_3042_end_0, end_mask = var_3042_end_mask_0, x = k_padded_13_cast_fp16)[name = string("op_3042_cast_fp16")];
+            int32 k_blocks_13_axis_0 = const()[name = string("k_blocks_13_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_13_cast_fp16 = stack(axis = k_blocks_13_axis_0, values = (var_3034_cast_fp16, var_3036_cast_fp16, var_3038_cast_fp16, var_3040_cast_fp16, var_3042_cast_fp16))[name = string("k_blocks_13_cast_fp16")];
+            tensor<int32, [4]> var_3046_begin_0 = const()[name = string("op_3046_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3046_end_0 = const()[name = string("op_3046_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_3046_end_mask_0 = const()[name = string("op_3046_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3046_cast_fp16 = slice_by_index(begin = var_3046_begin_0, end = var_3046_end_0, end_mask = var_3046_end_mask_0, x = v_padded_13_cast_fp16)[name = string("op_3046_cast_fp16")];
+            tensor<int32, [4]> var_3048_begin_0 = const()[name = string("op_3048_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_3048_end_0 = const()[name = string("op_3048_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_3048_end_mask_0 = const()[name = string("op_3048_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3048_cast_fp16 = slice_by_index(begin = var_3048_begin_0, end = var_3048_end_0, end_mask = var_3048_end_mask_0, x = v_padded_13_cast_fp16)[name = string("op_3048_cast_fp16")];
+            tensor<int32, [4]> var_3050_begin_0 = const()[name = string("op_3050_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_3050_end_0 = const()[name = string("op_3050_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_3050_end_mask_0 = const()[name = string("op_3050_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3050_cast_fp16 = slice_by_index(begin = var_3050_begin_0, end = var_3050_end_0, end_mask = var_3050_end_mask_0, x = v_padded_13_cast_fp16)[name = string("op_3050_cast_fp16")];
+            tensor<int32, [4]> var_3052_begin_0 = const()[name = string("op_3052_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_3052_end_0 = const()[name = string("op_3052_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_3052_end_mask_0 = const()[name = string("op_3052_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3052_cast_fp16 = slice_by_index(begin = var_3052_begin_0, end = var_3052_end_0, end_mask = var_3052_end_mask_0, x = v_padded_13_cast_fp16)[name = string("op_3052_cast_fp16")];
+            tensor<int32, [4]> var_3054_begin_0 = const()[name = string("op_3054_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_3054_end_0 = const()[name = string("op_3054_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_3054_end_mask_0 = const()[name = string("op_3054_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3054_cast_fp16 = slice_by_index(begin = var_3054_begin_0, end = var_3054_end_0, end_mask = var_3054_end_mask_0, x = v_padded_13_cast_fp16)[name = string("op_3054_cast_fp16")];
+            int32 v_blocks_13_axis_0 = const()[name = string("v_blocks_13_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_13_cast_fp16 = stack(axis = v_blocks_13_axis_0, values = (var_3046_cast_fp16, var_3048_cast_fp16, var_3050_cast_fp16, var_3052_cast_fp16, var_3054_cast_fp16))[name = string("v_blocks_13_cast_fp16")];
+            tensor<int32, [5]> var_3062 = const()[name = string("op_3062"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_3064 = const()[name = string("op_3064"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_13_transpose_x_0 = const()[name = string("matrix_ac_13_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_13_transpose_y_0 = const()[name = string("matrix_ac_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_13_cast_fp16 = transpose(perm = var_3062, x = q_blocks_13_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_13_cast_fp16 = transpose(perm = var_3064, x = k_blocks_13_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_13_cast_fp16 = matmul(transpose_x = matrix_ac_13_transpose_x_0, transpose_y = matrix_ac_13_transpose_y_0, x = queries_13_cast_fp16, y = keys_t_13_cast_fp16)[name = string("matrix_ac_13_cast_fp16")];
+            tensor<int32, [4]> var_3067 = const()[name = string("op_3067"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_13_cast_fp16 = reshape(shape = var_3067, x = queries_13_cast_fp16)[name = string("q_flat_13_cast_fp16")];
+            bool matrix_bd_61_transpose_x_0 = const()[name = string("matrix_bd_61_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_61_transpose_y_0 = const()[name = string("matrix_bd_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_13_to_fp16 = const()[name = string("rel_k_t_13_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79107136)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_61_cast_fp16 = matmul(transpose_x = matrix_bd_61_transpose_x_0, transpose_y = matrix_bd_61_transpose_y_0, x = q_flat_13_cast_fp16, y = rel_k_t_13_to_fp16)[name = string("matrix_bd_61_cast_fp16")];
+            tensor<int32, [5]> var_3072 = const()[name = string("op_3072"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_287_cast_fp16 = reshape(shape = var_3072, x = matrix_bd_61_cast_fp16)[name = string("input_287_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_63_pad_0 = const()[name = string("matrix_bd_63_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79133824)))];
+            string matrix_bd_63_mode_0 = const()[name = string("matrix_bd_63_mode_0"), val = string("constant")];
+            fp16 const_99_to_fp16 = const()[name = string("const_99_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_63_cast_fp16 = pad(constant_val = const_99_to_fp16, mode = matrix_bd_63_mode_0, pad = matrix_bd_63_pad_0, x = input_287_cast_fp16)[name = string("matrix_bd_63_cast_fp16")];
+            tensor<int32, [4]> var_3076 = const()[name = string("op_3076"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_65_cast_fp16 = reshape(shape = var_3076, x = matrix_bd_63_cast_fp16)[name = string("matrix_bd_65_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_67_begin_0 = const()[name = string("matrix_bd_67_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_67_end_0 = const()[name = string("matrix_bd_67_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_67_end_mask_0 = const()[name = string("matrix_bd_67_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_67_cast_fp16 = slice_by_index(begin = matrix_bd_67_begin_0, end = matrix_bd_67_end_0, end_mask = matrix_bd_67_end_mask_0, x = matrix_bd_65_cast_fp16)[name = string("matrix_bd_67_cast_fp16")];
+            tensor<int32, [5]> var_3082 = const()[name = string("op_3082"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_69_cast_fp16 = reshape(shape = var_3082, x = matrix_bd_67_cast_fp16)[name = string("matrix_bd_69_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_37_cast_fp16 = add(x = matrix_ac_13_cast_fp16, y = matrix_bd_69_cast_fp16)[name = string("attn_37_cast_fp16")];
+            fp16 _inversed_3085_y_0_to_fp16 = const()[name = string("_inversed_3085_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_3085_cast_fp16 = mul(x = attn_37_cast_fp16, y = _inversed_3085_y_0_to_fp16)[name = string("_inversed_3085_cast_fp16")];
+            string _inversed_3085_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_3085_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_3085_cast_fp16_to_fp32 = cast(dtype = _inversed_3085_cast_fp16_to_fp32_dtype_0, x = _inversed_3085_cast_fp16)[name = string("cast_255")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_3086 = tanh(x = _inversed_3085_cast_fp16_to_fp32)[name = string("op_3086")];
+            string var_3086_to_fp16_dtype_0 = const()[name = string("op_3086_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_6_softcap_to_fp16 = const()[name = string("self_attns_6_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_3086_to_fp16 = cast(dtype = var_3086_to_fp16_dtype_0, x = var_3086)[name = string("cast_254")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_39_cast_fp16 = mul(x = var_3086_to_fp16, y = self_attns_6_softcap_to_fp16)[name = string("attn_39_cast_fp16")];
+            string attn_39_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_39_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_39_cast_fp16_to_fp32 = cast(dtype = attn_39_cast_fp16_to_fp32_dtype_0, x = attn_39_cast_fp16)[name = string("cast_253")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_289 = select(a = var_2949, b = attn_39_cast_fp16_to_fp32, cond = var_460)[name = string("input_289")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_3090 = softmax(axis = var_2948, x = input_289)[name = string("op_3090")];
+            tensor<int32, [5]> var_3092 = const()[name = string("op_3092"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_37_transpose_x_0 = const()[name = string("out_37_transpose_x_0"), val = bool(false)];
+            bool out_37_transpose_y_0 = const()[name = string("out_37_transpose_y_0"), val = bool(false)];
+            string var_3090_to_fp16_dtype_0 = const()[name = string("op_3090_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_13_cast_fp16 = transpose(perm = var_3092, x = v_blocks_13_cast_fp16)[name = string("transpose_33")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_3090_to_fp16 = cast(dtype = var_3090_to_fp16_dtype_0, x = var_3090)[name = string("cast_252")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_37_cast_fp16 = matmul(transpose_x = out_37_transpose_x_0, transpose_y = out_37_transpose_y_0, x = var_3090_to_fp16, y = values_t_13_cast_fp16)[name = string("out_37_cast_fp16")];
+            tensor<int32, [5]> var_3095 = const()[name = string("op_3095"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_3097 = const()[name = string("op_3097"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_3096_cast_fp16 = transpose(perm = var_3095, x = out_37_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [1, 60, 1024]> out_39_cast_fp16 = reshape(shape = var_3097, x = var_3096_cast_fp16)[name = string("out_39_cast_fp16")];
+            tensor<int32, [3]> var_3100_begin_0 = const()[name = string("op_3100_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_3100_end_0 = const()[name = string("op_3100_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_3100_end_mask_0 = const()[name = string("op_3100_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_3100_cast_fp16 = slice_by_index(begin = var_3100_begin_0, end = var_3100_end_0, end_mask = var_3100_end_mask_0, x = out_39_cast_fp16)[name = string("op_3100_cast_fp16")];
+            fp16 self_attns_6_post_input_min_to_fp16 = const()[name = string("self_attns_6_post_input_min_to_fp16"), val = fp16(-0x1.e2p+3)];
+            fp16 self_attns_6_post_input_max_to_fp16 = const()[name = string("self_attns_6_post_input_max_to_fp16"), val = fp16(0x1.dep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_181_cast_fp16 = clip(alpha = self_attns_6_post_input_min_to_fp16, beta = self_attns_6_post_input_max_to_fp16, x = var_3100_cast_fp16)[name = string("clip_181_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_6_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79133952))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79658304))))[name = string("self_attns_6_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_73_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_6_post_linear_weight_to_fp16_palettized, x = clip_181_cast_fp16)[name = string("linear_73_cast_fp16")];
+            fp16 self_attns_6_post_output_min_to_fp16 = const()[name = string("self_attns_6_post_output_min_to_fp16"), val = fp16(-0x1.88p+5)];
+            fp16 self_attns_6_post_output_max_to_fp16 = const()[name = string("self_attns_6_post_output_max_to_fp16"), val = fp16(0x1.84p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_182_cast_fp16 = clip(alpha = self_attns_6_post_output_min_to_fp16, beta = self_attns_6_post_output_max_to_fp16, x = linear_73_cast_fp16)[name = string("clip_182_cast_fp16")];
+            fp16 var_3112_to_fp16 = const()[name = string("op_3112_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3113_to_fp16 = const()[name = string("op_3113_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_183_cast_fp16 = clip(alpha = var_3112_to_fp16, beta = var_3113_to_fp16, x = clip_182_cast_fp16)[name = string("clip_183_cast_fp16")];
+            string clip_183_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_183_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3115 = const()[name = string("op_3115"), val = fp32(-0x1p-1)];
+            fp32 var_3119_promoted = const()[name = string("op_3119_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_183_cast_fp16_to_fp32 = cast(dtype = clip_183_cast_fp16_to_fp32_dtype_0, x = clip_183_cast_fp16)[name = string("cast_251")];
+            tensor<fp32, [1, 50, 1024]> var_3125 = pow(x = clip_183_cast_fp16_to_fp32, y = var_3119_promoted)[name = string("op_3125")];
+            tensor<int32, [1]> var_3127_axes_0 = const()[name = string("op_3127_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3127_keep_dims_0 = const()[name = string("op_3127_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3127 = reduce_mean(axes = var_3127_axes_0, keep_dims = var_3127_keep_dims_0, x = var_3125)[name = string("op_3127")];
+            string var_3127_to_fp16_dtype_0 = const()[name = string("op_3127_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3128_to_fp16 = const()[name = string("op_3128_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3127_to_fp16 = cast(dtype = var_3127_to_fp16_dtype_0, x = var_3127)[name = string("cast_250")];
+            tensor<fp16, [1, 50, 1]> mean_squared_115_cast_fp16 = add(x = var_3127_to_fp16, y = var_3128_to_fp16)[name = string("mean_squared_115_cast_fp16")];
+            string mean_squared_115_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_115_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_115_cast_fp16_to_fp32 = cast(dtype = mean_squared_115_cast_fp16_to_fp32_dtype_0, x = mean_squared_115_cast_fp16)[name = string("cast_249")];
+            tensor<fp32, [1, 50, 1]> var_3130 = pow(x = mean_squared_115_cast_fp16_to_fp32, y = var_3115)[name = string("op_3130")];
+            string var_3130_to_fp16_dtype_0 = const()[name = string("op_3130_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3130_to_fp16 = cast(dtype = var_3130_to_fp16_dtype_0, x = var_3130)[name = string("cast_248")];
+            tensor<fp16, [1, 50, 1024]> normed_output_229_cast_fp16 = mul(x = clip_183_cast_fp16, y = var_3130_to_fp16)[name = string("normed_output_229_cast_fp16")];
+            tensor<fp16, [1024]> const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79659392)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_231_cast_fp16 = mul(x = normed_output_229_cast_fp16, y = const_100_to_fp16)[name = string("normed_output_231_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_673_cast_fp16 = add(x = normed_output_231_cast_fp16, y = hidden_states_647_cast_fp16)[name = string("hidden_states_673_cast_fp16")];
+            string hidden_states_673_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_673_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3137 = const()[name = string("op_3137"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_3138 = const()[name = string("op_3138"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_3150 = const()[name = string("op_3150"), val = fp32(-0x1p-1)];
+            fp32 var_3146_promoted = const()[name = string("op_3146_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_673_cast_fp16_to_fp32 = cast(dtype = hidden_states_673_cast_fp16_to_fp32_dtype_0, x = hidden_states_673_cast_fp16)[name = string("cast_247")];
+            tensor<fp32, [1, 50, 1024]> var_3158 = pow(x = hidden_states_673_cast_fp16_to_fp32, y = var_3146_promoted)[name = string("op_3158")];
+            tensor<int32, [1]> var_3160_axes_0 = const()[name = string("op_3160_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3160_keep_dims_0 = const()[name = string("op_3160_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3160 = reduce_mean(axes = var_3160_axes_0, keep_dims = var_3160_keep_dims_0, x = var_3158)[name = string("op_3160")];
+            string var_3160_to_fp16_dtype_0 = const()[name = string("op_3160_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3161_to_fp16 = const()[name = string("op_3161_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3160_to_fp16 = cast(dtype = var_3160_to_fp16_dtype_0, x = var_3160)[name = string("cast_246")];
+            tensor<fp16, [1, 50, 1]> mean_squared_117_cast_fp16 = add(x = var_3160_to_fp16, y = var_3161_to_fp16)[name = string("mean_squared_117_cast_fp16")];
+            string mean_squared_117_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_117_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_117_cast_fp16_to_fp32 = cast(dtype = mean_squared_117_cast_fp16_to_fp32_dtype_0, x = mean_squared_117_cast_fp16)[name = string("cast_245")];
+            tensor<fp32, [1, 50, 1]> var_3163 = pow(x = mean_squared_117_cast_fp16_to_fp32, y = var_3150)[name = string("op_3163")];
+            string var_3163_to_fp16_dtype_0 = const()[name = string("op_3163_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3163_to_fp16 = cast(dtype = var_3163_to_fp16_dtype_0, x = var_3163)[name = string("cast_244")];
+            tensor<fp16, [1, 50, 1024]> normed_output_233_cast_fp16 = mul(x = hidden_states_673_cast_fp16, y = var_3163_to_fp16)[name = string("normed_output_233_cast_fp16")];
+            tensor<fp16, [1024]> const_101_to_fp16 = const()[name = string("const_101_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79661504)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_235_cast_fp16 = mul(x = normed_output_233_cast_fp16, y = const_101_to_fp16)[name = string("normed_output_235_cast_fp16")];
+            fp16 lconv1ds_6_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_6_linear_start_input_min_to_fp16"), val = fp16(-0x1.5ep+3)];
+            fp16 lconv1ds_6_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_6_linear_start_input_max_to_fp16"), val = fp16(0x1.5cp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_184_cast_fp16 = clip(alpha = lconv1ds_6_linear_start_input_min_to_fp16, beta = lconv1ds_6_linear_start_input_max_to_fp16, x = normed_output_235_cast_fp16)[name = string("clip_184_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_6_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(79663616))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80712256))))[name = string("lconv1ds_6_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_74_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_6_linear_start_linear_weight_to_fp16_palettized, x = clip_184_cast_fp16)[name = string("linear_74_cast_fp16")];
+            fp16 lconv1ds_6_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_6_linear_start_output_min_to_fp16"), val = fp16(-0x1.6ap+4)];
+            fp16 lconv1ds_6_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_6_linear_start_output_max_to_fp16"), val = fp16(0x1.66p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_185_cast_fp16 = clip(alpha = lconv1ds_6_linear_start_output_min_to_fp16, beta = lconv1ds_6_linear_start_output_max_to_fp16, x = linear_74_cast_fp16)[name = string("clip_185_cast_fp16")];
+            int32 hidden_states_681_split_num_splits_0 = const()[name = string("hidden_states_681_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_681_split_axis_0 = const()[name = string("hidden_states_681_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_681_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_681_split_cast_fp16_1 = split(axis = hidden_states_681_split_axis_0, num_splits = hidden_states_681_split_num_splits_0, x = clip_185_cast_fp16)[name = string("hidden_states_681_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_681_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_681_split_cast_fp16_1)[name = string("hidden_states_681_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_681_cast_fp16 = mul(x = hidden_states_681_split_cast_fp16_0, y = hidden_states_681_split_1_sigmoid_cast_fp16)[name = string("hidden_states_681_cast_fp16")];
+            tensor<int32, [3]> input_297_perm_0 = const()[name = string("input_297_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_299_pad_0 = const()[name = string("input_299_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_299_mode_0 = const()[name = string("input_299_mode_0"), val = string("constant")];
+            fp16 const_102_to_fp16 = const()[name = string("const_102_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_297_cast_fp16 = transpose(perm = input_297_perm_0, x = hidden_states_681_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [1, 1024, 54]> input_299_cast_fp16 = pad(constant_val = const_102_to_fp16, mode = input_299_mode_0, pad = input_299_pad_0, x = input_297_cast_fp16)[name = string("input_299_cast_fp16")];
+            string var_3189_pad_type_0 = const()[name = string("op_3189_pad_type_0"), val = string("valid")];
+            int32 var_3189_groups_0 = const()[name = string("op_3189_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_3189_strides_0 = const()[name = string("op_3189_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3189_pad_0 = const()[name = string("op_3189_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3189_dilations_0 = const()[name = string("op_3189_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_6_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80714368))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80716992))))[name = string("lconv1ds_6_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_3189_cast_fp16 = conv(dilations = var_3189_dilations_0, groups = var_3189_groups_0, pad = var_3189_pad_0, pad_type = var_3189_pad_type_0, strides = var_3189_strides_0, weight = lconv1ds_6_depthwise_conv1d_weight_to_fp16_palettized, x = input_299_cast_fp16)[name = string("op_3189_cast_fp16")];
+            tensor<int32, [3]> hidden_states_683_perm_0 = const()[name = string("hidden_states_683_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_683_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_683_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_683_cast_fp16 = transpose(perm = hidden_states_683_perm_0, x = var_3189_cast_fp16)[name = string("transpose_30")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_683_cast_fp16_to_fp32 = cast(dtype = hidden_states_683_cast_fp16_to_fp32_dtype_0, x = hidden_states_683_cast_fp16)[name = string("cast_243")];
+            tensor<fp32, [1, 50, 1024]> clip_186 = clip(alpha = var_3138, beta = var_3137, x = hidden_states_683_cast_fp16_to_fp32)[name = string("clip_186")];
+            fp32 var_3146_promoted_1 = const()[name = string("op_3146_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3194 = pow(x = clip_186, y = var_3146_promoted_1)[name = string("op_3194")];
+            tensor<int32, [1]> var_3196_axes_0 = const()[name = string("op_3196_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3196_keep_dims_0 = const()[name = string("op_3196_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3196 = reduce_mean(axes = var_3196_axes_0, keep_dims = var_3196_keep_dims_0, x = var_3194)[name = string("op_3196")];
+            string var_3196_to_fp16_dtype_0 = const()[name = string("op_3196_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3197_to_fp16 = const()[name = string("op_3197_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3196_to_fp16 = cast(dtype = var_3196_to_fp16_dtype_0, x = var_3196)[name = string("cast_242")];
+            tensor<fp16, [1, 50, 1]> mean_squared_119_cast_fp16 = add(x = var_3196_to_fp16, y = var_3197_to_fp16)[name = string("mean_squared_119_cast_fp16")];
+            string mean_squared_119_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_119_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_119_cast_fp16_to_fp32 = cast(dtype = mean_squared_119_cast_fp16_to_fp32_dtype_0, x = mean_squared_119_cast_fp16)[name = string("cast_241")];
+            tensor<fp32, [1, 50, 1]> var_3199 = pow(x = mean_squared_119_cast_fp16_to_fp32, y = var_3150)[name = string("op_3199")];
+            string clip_186_to_fp16_dtype_0 = const()[name = string("clip_186_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3199_to_fp16_dtype_0 = const()[name = string("op_3199_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_186_to_fp16 = cast(dtype = clip_186_to_fp16_dtype_0, x = clip_186)[name = string("cast_239")];
+            tensor<fp16, [1, 50, 1]> var_3199_to_fp16 = cast(dtype = var_3199_to_fp16_dtype_0, x = var_3199)[name = string("cast_240")];
+            tensor<fp16, [1, 50, 1024]> normed_output_237_cast_fp16 = mul(x = clip_186_to_fp16, y = var_3199_to_fp16)[name = string("normed_output_237_cast_fp16")];
+            tensor<fp16, [1024]> const_103_to_fp16 = const()[name = string("const_103_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80718080)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_239_cast_fp16 = mul(x = normed_output_237_cast_fp16, y = const_103_to_fp16)[name = string("normed_output_239_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_689_cast_fp16 = silu(x = normed_output_239_cast_fp16)[name = string("hidden_states_689_cast_fp16")];
+            fp16 lconv1ds_6_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_6_linear_end_input_min_to_fp16"), val = fp16(-0x1.d2p+3)];
+            fp16 lconv1ds_6_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_6_linear_end_input_max_to_fp16"), val = fp16(0x1.cep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_187_cast_fp16 = clip(alpha = lconv1ds_6_linear_end_input_min_to_fp16, beta = lconv1ds_6_linear_end_input_max_to_fp16, x = hidden_states_689_cast_fp16)[name = string("clip_187_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_6_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(80720192))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81244544))))[name = string("lconv1ds_6_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_75_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_6_linear_end_linear_weight_to_fp16_palettized, x = clip_187_cast_fp16)[name = string("linear_75_cast_fp16")];
+            fp16 lconv1ds_6_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_6_linear_end_output_min_to_fp16"), val = fp16(-0x1.f8p+2)];
+            fp16 lconv1ds_6_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_6_linear_end_output_max_to_fp16"), val = fp16(0x1.f4p+2)];
+            tensor<fp16, [1, 50, 1024]> clip_188_cast_fp16 = clip(alpha = lconv1ds_6_linear_end_output_min_to_fp16, beta = lconv1ds_6_linear_end_output_max_to_fp16, x = linear_75_cast_fp16)[name = string("clip_188_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_695_cast_fp16 = add(x = clip_188_cast_fp16, y = hidden_states_673_cast_fp16)[name = string("hidden_states_695_cast_fp16")];
+            string hidden_states_695_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_695_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3223 = const()[name = string("op_3223"), val = fp32(-0x1p-1)];
+            fp32 var_3224 = const()[name = string("op_3224"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_3225 = const()[name = string("op_3225"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_695_cast_fp16_to_fp32 = cast(dtype = hidden_states_695_cast_fp16_to_fp32_dtype_0, x = hidden_states_695_cast_fp16)[name = string("cast_238")];
+            tensor<fp32, [1, 50, 1024]> clip_189 = clip(alpha = var_3225, beta = var_3224, x = hidden_states_695_cast_fp16_to_fp32)[name = string("clip_189")];
+            fp32 var_3219_promoted = const()[name = string("op_3219_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3233 = pow(x = clip_189, y = var_3219_promoted)[name = string("op_3233")];
+            tensor<int32, [1]> var_3235_axes_0 = const()[name = string("op_3235_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3235_keep_dims_0 = const()[name = string("op_3235_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3235 = reduce_mean(axes = var_3235_axes_0, keep_dims = var_3235_keep_dims_0, x = var_3233)[name = string("op_3235")];
+            string var_3235_to_fp16_dtype_0 = const()[name = string("op_3235_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3236_to_fp16 = const()[name = string("op_3236_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3235_to_fp16 = cast(dtype = var_3235_to_fp16_dtype_0, x = var_3235)[name = string("cast_237")];
+            tensor<fp16, [1, 50, 1]> mean_squared_121_cast_fp16 = add(x = var_3235_to_fp16, y = var_3236_to_fp16)[name = string("mean_squared_121_cast_fp16")];
+            string mean_squared_121_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_121_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_121_cast_fp16_to_fp32 = cast(dtype = mean_squared_121_cast_fp16_to_fp32_dtype_0, x = mean_squared_121_cast_fp16)[name = string("cast_236")];
+            tensor<fp32, [1, 50, 1]> var_3238 = pow(x = mean_squared_121_cast_fp16_to_fp32, y = var_3223)[name = string("op_3238")];
+            string clip_189_to_fp16_dtype_0 = const()[name = string("clip_189_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3238_to_fp16_dtype_0 = const()[name = string("op_3238_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_189_to_fp16 = cast(dtype = clip_189_to_fp16_dtype_0, x = clip_189)[name = string("cast_234")];
+            tensor<fp16, [1, 50, 1]> var_3238_to_fp16 = cast(dtype = var_3238_to_fp16_dtype_0, x = var_3238)[name = string("cast_235")];
+            tensor<fp16, [1, 50, 1024]> normed_output_241_cast_fp16 = mul(x = clip_189_to_fp16, y = var_3238_to_fp16)[name = string("normed_output_241_cast_fp16")];
+            tensor<fp16, [1024]> const_104_to_fp16 = const()[name = string("const_104_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81245632)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_243_cast_fp16 = mul(x = normed_output_241_cast_fp16, y = const_104_to_fp16)[name = string("normed_output_243_cast_fp16")];
+            fp16 feed_forward2s_6_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.d2p+3)];
+            fp16 feed_forward2s_6_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.cep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_190_cast_fp16 = clip(alpha = feed_forward2s_6_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_6_ffw_layer_1_input_max_to_fp16, x = normed_output_243_cast_fp16)[name = string("clip_190_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_6_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(81247744))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(83344960))))[name = string("feed_forward2s_6_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_76_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_6_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_190_cast_fp16)[name = string("linear_76_cast_fp16")];
+            fp16 feed_forward2s_6_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.0ap+5)];
+            fp16 feed_forward2s_6_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.08p+5)];
+            tensor<fp16, [1, 50, 4096]> clip_191_cast_fp16 = clip(alpha = feed_forward2s_6_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_6_ffw_layer_1_output_max_to_fp16, x = linear_76_cast_fp16)[name = string("clip_191_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_705_cast_fp16 = silu(x = clip_191_cast_fp16)[name = string("hidden_states_705_cast_fp16")];
+            fp16 feed_forward2s_6_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.6ep+3)];
+            fp16 feed_forward2s_6_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.6cp+3)];
+            tensor<fp16, [1, 50, 4096]> clip_192_cast_fp16 = clip(alpha = feed_forward2s_6_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_6_ffw_layer_2_input_max_to_fp16, x = hidden_states_705_cast_fp16)[name = string("clip_192_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_6_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(83349120))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85446336))))[name = string("feed_forward2s_6_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_77_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_6_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_192_cast_fp16)[name = string("linear_77_cast_fp16")];
+            fp16 feed_forward2s_6_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.34p+6)];
+            fp16 feed_forward2s_6_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_6_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.32p+6)];
+            tensor<fp16, [1, 50, 1024]> clip_193_cast_fp16 = clip(alpha = feed_forward2s_6_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_6_ffw_layer_2_output_max_to_fp16, x = linear_77_cast_fp16)[name = string("clip_193_cast_fp16")];
+            string clip_193_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_193_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_193_cast_fp16_to_fp32 = cast(dtype = clip_193_cast_fp16_to_fp32_dtype_0, x = clip_193_cast_fp16)[name = string("cast_233")];
+            tensor<fp32, [1, 50, 1024]> clip_194 = clip(alpha = var_3225, beta = var_3224, x = clip_193_cast_fp16_to_fp32)[name = string("clip_194")];
+            fp32 var_3219_promoted_1 = const()[name = string("op_3219_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3265 = pow(x = clip_194, y = var_3219_promoted_1)[name = string("op_3265")];
+            tensor<int32, [1]> var_3267_axes_0 = const()[name = string("op_3267_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3267_keep_dims_0 = const()[name = string("op_3267_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3267 = reduce_mean(axes = var_3267_axes_0, keep_dims = var_3267_keep_dims_0, x = var_3265)[name = string("op_3267")];
+            string var_3267_to_fp16_dtype_0 = const()[name = string("op_3267_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3268_to_fp16 = const()[name = string("op_3268_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3267_to_fp16 = cast(dtype = var_3267_to_fp16_dtype_0, x = var_3267)[name = string("cast_232")];
+            tensor<fp16, [1, 50, 1]> mean_squared_123_cast_fp16 = add(x = var_3267_to_fp16, y = var_3268_to_fp16)[name = string("mean_squared_123_cast_fp16")];
+            string mean_squared_123_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_123_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_123_cast_fp16_to_fp32 = cast(dtype = mean_squared_123_cast_fp16_to_fp32_dtype_0, x = mean_squared_123_cast_fp16)[name = string("cast_231")];
+            tensor<fp32, [1, 50, 1]> var_3270 = pow(x = mean_squared_123_cast_fp16_to_fp32, y = var_3223)[name = string("op_3270")];
+            string clip_194_to_fp16_dtype_0 = const()[name = string("clip_194_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3270_to_fp16_dtype_0 = const()[name = string("op_3270_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_194_to_fp16 = cast(dtype = clip_194_to_fp16_dtype_0, x = clip_194)[name = string("cast_229")];
+            tensor<fp16, [1, 50, 1]> var_3270_to_fp16 = cast(dtype = var_3270_to_fp16_dtype_0, x = var_3270)[name = string("cast_230")];
+            tensor<fp16, [1, 50, 1024]> normed_output_245_cast_fp16 = mul(x = clip_194_to_fp16, y = var_3270_to_fp16)[name = string("normed_output_245_cast_fp16")];
+            tensor<fp16, [1024]> const_105_to_fp16 = const()[name = string("const_105_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85447424)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_247_cast_fp16 = mul(x = normed_output_245_cast_fp16, y = const_105_to_fp16)[name = string("normed_output_247_cast_fp16")];
+            fp16 var_3215_to_fp16 = const()[name = string("op_3215_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_717_cast_fp16 = mul(x = normed_output_247_cast_fp16, y = var_3215_to_fp16)[name = string("hidden_states_717_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_719_cast_fp16 = add(x = hidden_states_717_cast_fp16, y = hidden_states_695_cast_fp16)[name = string("hidden_states_719_cast_fp16")];
+            fp16 var_3277_to_fp16 = const()[name = string("op_3277_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3278_to_fp16 = const()[name = string("op_3278_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_195_cast_fp16 = clip(alpha = var_3277_to_fp16, beta = var_3278_to_fp16, x = hidden_states_719_cast_fp16)[name = string("clip_195_cast_fp16")];
+            string clip_195_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_195_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3280 = const()[name = string("op_3280"), val = fp32(-0x1p-1)];
+            fp32 var_3284_promoted = const()[name = string("op_3284_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_195_cast_fp16_to_fp32 = cast(dtype = clip_195_cast_fp16_to_fp32_dtype_0, x = clip_195_cast_fp16)[name = string("cast_228")];
+            tensor<fp32, [1, 50, 1024]> var_3290 = pow(x = clip_195_cast_fp16_to_fp32, y = var_3284_promoted)[name = string("op_3290")];
+            tensor<int32, [1]> var_3292_axes_0 = const()[name = string("op_3292_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3292_keep_dims_0 = const()[name = string("op_3292_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3292 = reduce_mean(axes = var_3292_axes_0, keep_dims = var_3292_keep_dims_0, x = var_3290)[name = string("op_3292")];
+            string var_3292_to_fp16_dtype_0 = const()[name = string("op_3292_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3293_to_fp16 = const()[name = string("op_3293_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3292_to_fp16 = cast(dtype = var_3292_to_fp16_dtype_0, x = var_3292)[name = string("cast_227")];
+            tensor<fp16, [1, 50, 1]> mean_squared_125_cast_fp16 = add(x = var_3292_to_fp16, y = var_3293_to_fp16)[name = string("mean_squared_125_cast_fp16")];
+            string mean_squared_125_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_125_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_125_cast_fp16_to_fp32 = cast(dtype = mean_squared_125_cast_fp16_to_fp32_dtype_0, x = mean_squared_125_cast_fp16)[name = string("cast_226")];
+            tensor<fp32, [1, 50, 1]> var_3295 = pow(x = mean_squared_125_cast_fp16_to_fp32, y = var_3280)[name = string("op_3295")];
+            string var_3295_to_fp16_dtype_0 = const()[name = string("op_3295_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3295_to_fp16 = cast(dtype = var_3295_to_fp16_dtype_0, x = var_3295)[name = string("cast_225")];
+            tensor<fp16, [1, 50, 1024]> normed_output_249_cast_fp16 = mul(x = clip_195_cast_fp16, y = var_3295_to_fp16)[name = string("normed_output_249_cast_fp16")];
+            tensor<fp16, [1024]> const_106_to_fp16 = const()[name = string("const_106_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85449536)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_251_cast_fp16 = mul(x = normed_output_249_cast_fp16, y = const_106_to_fp16)[name = string("normed_output_251_cast_fp16")];
+            string normed_output_251_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_251_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3308 = const()[name = string("op_3308"), val = fp32(-0x1p-1)];
+            fp32 var_3309 = const()[name = string("op_3309"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_3310 = const()[name = string("op_3310"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_251_cast_fp16_to_fp32 = cast(dtype = normed_output_251_cast_fp16_to_fp32_dtype_0, x = normed_output_251_cast_fp16)[name = string("cast_224")];
+            tensor<fp32, [1, 50, 1024]> clip_196 = clip(alpha = var_3310, beta = var_3309, x = normed_output_251_cast_fp16_to_fp32)[name = string("clip_196")];
+            fp32 var_3304_promoted = const()[name = string("op_3304_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3318 = pow(x = clip_196, y = var_3304_promoted)[name = string("op_3318")];
+            tensor<int32, [1]> var_3320_axes_0 = const()[name = string("op_3320_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3320_keep_dims_0 = const()[name = string("op_3320_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3320 = reduce_mean(axes = var_3320_axes_0, keep_dims = var_3320_keep_dims_0, x = var_3318)[name = string("op_3320")];
+            string var_3320_to_fp16_dtype_0 = const()[name = string("op_3320_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3321_to_fp16 = const()[name = string("op_3321_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3320_to_fp16 = cast(dtype = var_3320_to_fp16_dtype_0, x = var_3320)[name = string("cast_223")];
+            tensor<fp16, [1, 50, 1]> mean_squared_127_cast_fp16 = add(x = var_3320_to_fp16, y = var_3321_to_fp16)[name = string("mean_squared_127_cast_fp16")];
+            string mean_squared_127_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_127_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_127_cast_fp16_to_fp32 = cast(dtype = mean_squared_127_cast_fp16_to_fp32_dtype_0, x = mean_squared_127_cast_fp16)[name = string("cast_222")];
+            tensor<fp32, [1, 50, 1]> var_3323 = pow(x = mean_squared_127_cast_fp16_to_fp32, y = var_3308)[name = string("op_3323")];
+            string clip_196_to_fp16_dtype_0 = const()[name = string("clip_196_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3323_to_fp16_dtype_0 = const()[name = string("op_3323_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_196_to_fp16 = cast(dtype = clip_196_to_fp16_dtype_0, x = clip_196)[name = string("cast_220")];
+            tensor<fp16, [1, 50, 1]> var_3323_to_fp16 = cast(dtype = var_3323_to_fp16_dtype_0, x = var_3323)[name = string("cast_221")];
+            tensor<fp16, [1, 50, 1024]> normed_output_253_cast_fp16 = mul(x = clip_196_to_fp16, y = var_3323_to_fp16)[name = string("normed_output_253_cast_fp16")];
+            tensor<fp16, [1024]> const_107_to_fp16 = const()[name = string("const_107_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85451648)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_255_cast_fp16 = mul(x = normed_output_253_cast_fp16, y = const_107_to_fp16)[name = string("normed_output_255_cast_fp16")];
+            fp16 feed_forward1s_7_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.8cp+3)];
+            fp16 feed_forward1s_7_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.88p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_197_cast_fp16 = clip(alpha = feed_forward1s_7_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_7_ffw_layer_1_input_max_to_fp16, x = normed_output_255_cast_fp16)[name = string("clip_197_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_7_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(85453760))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87550976))))[name = string("feed_forward1s_7_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_78_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_7_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_197_cast_fp16)[name = string("linear_78_cast_fp16")];
+            fp16 feed_forward1s_7_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.98p+4)];
+            fp16 feed_forward1s_7_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.94p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_198_cast_fp16 = clip(alpha = feed_forward1s_7_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_7_ffw_layer_1_output_max_to_fp16, x = linear_78_cast_fp16)[name = string("clip_198_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_735_cast_fp16 = silu(x = clip_198_cast_fp16)[name = string("hidden_states_735_cast_fp16")];
+            fp16 feed_forward1s_7_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.12p+3)];
+            fp16 feed_forward1s_7_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.1p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_199_cast_fp16 = clip(alpha = feed_forward1s_7_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_7_ffw_layer_2_input_max_to_fp16, x = hidden_states_735_cast_fp16)[name = string("clip_199_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_7_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87555136))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89652352))))[name = string("feed_forward1s_7_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_79_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_7_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_199_cast_fp16)[name = string("linear_79_cast_fp16")];
+            fp16 feed_forward1s_7_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.52p+5)];
+            fp16 feed_forward1s_7_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_7_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.5p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_200_cast_fp16 = clip(alpha = feed_forward1s_7_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_7_ffw_layer_2_output_max_to_fp16, x = linear_79_cast_fp16)[name = string("clip_200_cast_fp16")];
+            string clip_200_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_200_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_200_cast_fp16_to_fp32 = cast(dtype = clip_200_cast_fp16_to_fp32_dtype_0, x = clip_200_cast_fp16)[name = string("cast_219")];
+            tensor<fp32, [1, 50, 1024]> clip_201 = clip(alpha = var_3310, beta = var_3309, x = clip_200_cast_fp16_to_fp32)[name = string("clip_201")];
+            fp32 var_3304_promoted_1 = const()[name = string("op_3304_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3350 = pow(x = clip_201, y = var_3304_promoted_1)[name = string("op_3350")];
+            tensor<int32, [1]> var_3352_axes_0 = const()[name = string("op_3352_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3352_keep_dims_0 = const()[name = string("op_3352_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3352 = reduce_mean(axes = var_3352_axes_0, keep_dims = var_3352_keep_dims_0, x = var_3350)[name = string("op_3352")];
+            string var_3352_to_fp16_dtype_0 = const()[name = string("op_3352_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3353_to_fp16 = const()[name = string("op_3353_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3352_to_fp16 = cast(dtype = var_3352_to_fp16_dtype_0, x = var_3352)[name = string("cast_218")];
+            tensor<fp16, [1, 50, 1]> mean_squared_129_cast_fp16 = add(x = var_3352_to_fp16, y = var_3353_to_fp16)[name = string("mean_squared_129_cast_fp16")];
+            string mean_squared_129_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_129_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_129_cast_fp16_to_fp32 = cast(dtype = mean_squared_129_cast_fp16_to_fp32_dtype_0, x = mean_squared_129_cast_fp16)[name = string("cast_217")];
+            tensor<fp32, [1, 50, 1]> var_3355 = pow(x = mean_squared_129_cast_fp16_to_fp32, y = var_3308)[name = string("op_3355")];
+            string clip_201_to_fp16_dtype_0 = const()[name = string("clip_201_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3355_to_fp16_dtype_0 = const()[name = string("op_3355_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_201_to_fp16 = cast(dtype = clip_201_to_fp16_dtype_0, x = clip_201)[name = string("cast_215")];
+            tensor<fp16, [1, 50, 1]> var_3355_to_fp16 = cast(dtype = var_3355_to_fp16_dtype_0, x = var_3355)[name = string("cast_216")];
+            tensor<fp16, [1, 50, 1024]> normed_output_257_cast_fp16 = mul(x = clip_201_to_fp16, y = var_3355_to_fp16)[name = string("normed_output_257_cast_fp16")];
+            tensor<fp16, [1024]> const_108_to_fp16 = const()[name = string("const_108_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89653440)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_259_cast_fp16 = mul(x = normed_output_257_cast_fp16, y = const_108_to_fp16)[name = string("normed_output_259_cast_fp16")];
+            fp16 var_3300_to_fp16 = const()[name = string("op_3300_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_747_cast_fp16 = mul(x = normed_output_259_cast_fp16, y = var_3300_to_fp16)[name = string("hidden_states_747_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_749_cast_fp16 = add(x = hidden_states_747_cast_fp16, y = normed_output_251_cast_fp16)[name = string("hidden_states_749_cast_fp16")];
+            fp16 var_3362_to_fp16 = const()[name = string("op_3362_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3363_to_fp16 = const()[name = string("op_3363_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_202_cast_fp16 = clip(alpha = var_3362_to_fp16, beta = var_3363_to_fp16, x = hidden_states_749_cast_fp16)[name = string("clip_202_cast_fp16")];
+            string clip_202_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_202_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3365 = const()[name = string("op_3365"), val = fp32(-0x1p-1)];
+            fp32 var_3369_promoted = const()[name = string("op_3369_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_202_cast_fp16_to_fp32 = cast(dtype = clip_202_cast_fp16_to_fp32_dtype_0, x = clip_202_cast_fp16)[name = string("cast_214")];
+            tensor<fp32, [1, 50, 1024]> var_3375 = pow(x = clip_202_cast_fp16_to_fp32, y = var_3369_promoted)[name = string("op_3375")];
+            tensor<int32, [1]> var_3377_axes_0 = const()[name = string("op_3377_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3377_keep_dims_0 = const()[name = string("op_3377_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3377 = reduce_mean(axes = var_3377_axes_0, keep_dims = var_3377_keep_dims_0, x = var_3375)[name = string("op_3377")];
+            string var_3377_to_fp16_dtype_0 = const()[name = string("op_3377_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3378_to_fp16 = const()[name = string("op_3378_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3377_to_fp16 = cast(dtype = var_3377_to_fp16_dtype_0, x = var_3377)[name = string("cast_213")];
+            tensor<fp16, [1, 50, 1]> mean_squared_131_cast_fp16 = add(x = var_3377_to_fp16, y = var_3378_to_fp16)[name = string("mean_squared_131_cast_fp16")];
+            string mean_squared_131_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_131_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_131_cast_fp16_to_fp32 = cast(dtype = mean_squared_131_cast_fp16_to_fp32_dtype_0, x = mean_squared_131_cast_fp16)[name = string("cast_212")];
+            tensor<fp32, [1, 50, 1]> var_3380 = pow(x = mean_squared_131_cast_fp16_to_fp32, y = var_3365)[name = string("op_3380")];
+            string var_3380_to_fp16_dtype_0 = const()[name = string("op_3380_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3380_to_fp16 = cast(dtype = var_3380_to_fp16_dtype_0, x = var_3380)[name = string("cast_211")];
+            tensor<fp16, [1, 50, 1024]> normed_output_261_cast_fp16 = mul(x = clip_202_cast_fp16, y = var_3380_to_fp16)[name = string("normed_output_261_cast_fp16")];
+            tensor<fp16, [1024]> const_109_to_fp16 = const()[name = string("const_109_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89655552)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_263_cast_fp16 = mul(x = normed_output_261_cast_fp16, y = const_109_to_fp16)[name = string("normed_output_263_cast_fp16")];
+            int32 var_3386 = const()[name = string("op_3386"), val = int32(-1)];
+            fp32 var_3387 = const()[name = string("op_3387"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_7_q_proj_input_min_to_fp16 = const()[name = string("self_attns_7_q_proj_input_min_to_fp16"), val = fp16(-0x1.5ep+3)];
+            fp16 self_attns_7_q_proj_input_max_to_fp16 = const()[name = string("self_attns_7_q_proj_input_max_to_fp16"), val = fp16(0x1.5cp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_203_cast_fp16 = clip(alpha = self_attns_7_q_proj_input_min_to_fp16, beta = self_attns_7_q_proj_input_max_to_fp16, x = normed_output_263_cast_fp16)[name = string("clip_203_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_7_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89657664))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90182016))))[name = string("self_attns_7_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_80_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_7_q_proj_linear_weight_to_fp16_palettized, x = clip_203_cast_fp16)[name = string("linear_80_cast_fp16")];
+            fp16 self_attns_7_q_proj_output_min_to_fp16 = const()[name = string("self_attns_7_q_proj_output_min_to_fp16"), val = fp16(-0x1.18p+4)];
+            fp16 self_attns_7_q_proj_output_max_to_fp16 = const()[name = string("self_attns_7_q_proj_output_max_to_fp16"), val = fp16(0x1.16p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_204_cast_fp16 = clip(alpha = self_attns_7_q_proj_output_min_to_fp16, beta = self_attns_7_q_proj_output_max_to_fp16, x = linear_80_cast_fp16)[name = string("clip_204_cast_fp16")];
+            tensor<int32, [4]> var_3431 = const()[name = string("op_3431"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_15_cast_fp16 = reshape(shape = var_3431, x = clip_204_cast_fp16)[name = string("q_15_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_7_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90183104))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90707456))))[name = string("self_attns_7_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_81_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_7_k_proj_linear_weight_to_fp16_palettized, x = clip_203_cast_fp16)[name = string("linear_81_cast_fp16")];
+            fp16 self_attns_7_k_proj_output_min_to_fp16 = const()[name = string("self_attns_7_k_proj_output_min_to_fp16"), val = fp16(-0x1.18p+4)];
+            fp16 self_attns_7_k_proj_output_max_to_fp16 = const()[name = string("self_attns_7_k_proj_output_max_to_fp16"), val = fp16(0x1.16p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_206_cast_fp16 = clip(alpha = self_attns_7_k_proj_output_min_to_fp16, beta = self_attns_7_k_proj_output_max_to_fp16, x = linear_81_cast_fp16)[name = string("clip_206_cast_fp16")];
+            tensor<int32, [4]> var_3443 = const()[name = string("op_3443"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_15_cast_fp16 = reshape(shape = var_3443, x = clip_206_cast_fp16)[name = string("k_15_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_7_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90708544))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91232896))))[name = string("self_attns_7_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_82_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_7_v_proj_linear_weight_to_fp16_palettized, x = clip_203_cast_fp16)[name = string("linear_82_cast_fp16")];
+            fp16 self_attns_7_v_proj_output_min_to_fp16 = const()[name = string("self_attns_7_v_proj_output_min_to_fp16"), val = fp16(-0x1.18p+4)];
+            fp16 self_attns_7_v_proj_output_max_to_fp16 = const()[name = string("self_attns_7_v_proj_output_max_to_fp16"), val = fp16(0x1.16p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_208_cast_fp16 = clip(alpha = self_attns_7_v_proj_output_min_to_fp16, beta = self_attns_7_v_proj_output_max_to_fp16, x = linear_82_cast_fp16)[name = string("clip_208_cast_fp16")];
+            tensor<int32, [4]> var_3455 = const()[name = string("op_3455"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_327_cast_fp16 = reshape(shape = var_3455, x = clip_208_cast_fp16)[name = string("input_327_cast_fp16")];
+            fp16 var_3457_to_fp16 = const()[name = string("op_3457_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_3458_cast_fp16 = mul(x = q_15_cast_fp16, y = var_3457_to_fp16)[name = string("op_3458_cast_fp16")];
+            tensor<fp16, [128]> var_3459_to_fp16 = const()[name = string("op_3459_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91233984)))];
+            tensor<fp16, [1, 50, 8, 128]> input_323_cast_fp16 = mul(x = var_3458_cast_fp16, y = var_3459_to_fp16)[name = string("input_323_cast_fp16")];
+            fp16 var_3461_to_fp16 = const()[name = string("op_3461_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_325_cast_fp16 = mul(x = k_15_cast_fp16, y = var_3461_to_fp16)[name = string("input_325_cast_fp16")];
+            tensor<int32, [8]> q_padded_15_pad_0 = const()[name = string("q_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_15_mode_0 = const()[name = string("q_padded_15_mode_0"), val = string("constant")];
+            fp16 const_110_to_fp16 = const()[name = string("const_110_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_15_cast_fp16 = pad(constant_val = const_110_to_fp16, mode = q_padded_15_mode_0, pad = q_padded_15_pad_0, x = input_323_cast_fp16)[name = string("q_padded_15_cast_fp16")];
+            tensor<int32, [5]> var_3465 = const()[name = string("op_3465"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_15_cast_fp16 = reshape(shape = var_3465, x = q_padded_15_cast_fp16)[name = string("q_blocks_15_cast_fp16")];
+            tensor<int32, [8]> k_padded_15_pad_0 = const()[name = string("k_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_15_mode_0 = const()[name = string("k_padded_15_mode_0"), val = string("constant")];
+            fp16 const_111_to_fp16 = const()[name = string("const_111_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_15_cast_fp16 = pad(constant_val = const_111_to_fp16, mode = k_padded_15_mode_0, pad = k_padded_15_pad_0, x = input_325_cast_fp16)[name = string("k_padded_15_cast_fp16")];
+            tensor<int32, [8]> v_padded_15_pad_0 = const()[name = string("v_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_15_mode_0 = const()[name = string("v_padded_15_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_15_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = v_padded_15_mode_0, pad = v_padded_15_pad_0, x = input_327_cast_fp16)[name = string("v_padded_15_cast_fp16")];
+            tensor<int32, [4]> var_3472_begin_0 = const()[name = string("op_3472_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3472_end_0 = const()[name = string("op_3472_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_3472_end_mask_0 = const()[name = string("op_3472_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3472_cast_fp16 = slice_by_index(begin = var_3472_begin_0, end = var_3472_end_0, end_mask = var_3472_end_mask_0, x = k_padded_15_cast_fp16)[name = string("op_3472_cast_fp16")];
+            tensor<int32, [4]> var_3474_begin_0 = const()[name = string("op_3474_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_3474_end_0 = const()[name = string("op_3474_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_3474_end_mask_0 = const()[name = string("op_3474_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3474_cast_fp16 = slice_by_index(begin = var_3474_begin_0, end = var_3474_end_0, end_mask = var_3474_end_mask_0, x = k_padded_15_cast_fp16)[name = string("op_3474_cast_fp16")];
+            tensor<int32, [4]> var_3476_begin_0 = const()[name = string("op_3476_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_3476_end_0 = const()[name = string("op_3476_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_3476_end_mask_0 = const()[name = string("op_3476_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3476_cast_fp16 = slice_by_index(begin = var_3476_begin_0, end = var_3476_end_0, end_mask = var_3476_end_mask_0, x = k_padded_15_cast_fp16)[name = string("op_3476_cast_fp16")];
+            tensor<int32, [4]> var_3478_begin_0 = const()[name = string("op_3478_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_3478_end_0 = const()[name = string("op_3478_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_3478_end_mask_0 = const()[name = string("op_3478_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3478_cast_fp16 = slice_by_index(begin = var_3478_begin_0, end = var_3478_end_0, end_mask = var_3478_end_mask_0, x = k_padded_15_cast_fp16)[name = string("op_3478_cast_fp16")];
+            tensor<int32, [4]> var_3480_begin_0 = const()[name = string("op_3480_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_3480_end_0 = const()[name = string("op_3480_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_3480_end_mask_0 = const()[name = string("op_3480_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3480_cast_fp16 = slice_by_index(begin = var_3480_begin_0, end = var_3480_end_0, end_mask = var_3480_end_mask_0, x = k_padded_15_cast_fp16)[name = string("op_3480_cast_fp16")];
+            int32 k_blocks_15_axis_0 = const()[name = string("k_blocks_15_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_15_cast_fp16 = stack(axis = k_blocks_15_axis_0, values = (var_3472_cast_fp16, var_3474_cast_fp16, var_3476_cast_fp16, var_3478_cast_fp16, var_3480_cast_fp16))[name = string("k_blocks_15_cast_fp16")];
+            tensor<int32, [4]> var_3484_begin_0 = const()[name = string("op_3484_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3484_end_0 = const()[name = string("op_3484_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_3484_end_mask_0 = const()[name = string("op_3484_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3484_cast_fp16 = slice_by_index(begin = var_3484_begin_0, end = var_3484_end_0, end_mask = var_3484_end_mask_0, x = v_padded_15_cast_fp16)[name = string("op_3484_cast_fp16")];
+            tensor<int32, [4]> var_3486_begin_0 = const()[name = string("op_3486_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_3486_end_0 = const()[name = string("op_3486_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_3486_end_mask_0 = const()[name = string("op_3486_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3486_cast_fp16 = slice_by_index(begin = var_3486_begin_0, end = var_3486_end_0, end_mask = var_3486_end_mask_0, x = v_padded_15_cast_fp16)[name = string("op_3486_cast_fp16")];
+            tensor<int32, [4]> var_3488_begin_0 = const()[name = string("op_3488_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_3488_end_0 = const()[name = string("op_3488_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_3488_end_mask_0 = const()[name = string("op_3488_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3488_cast_fp16 = slice_by_index(begin = var_3488_begin_0, end = var_3488_end_0, end_mask = var_3488_end_mask_0, x = v_padded_15_cast_fp16)[name = string("op_3488_cast_fp16")];
+            tensor<int32, [4]> var_3490_begin_0 = const()[name = string("op_3490_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_3490_end_0 = const()[name = string("op_3490_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_3490_end_mask_0 = const()[name = string("op_3490_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3490_cast_fp16 = slice_by_index(begin = var_3490_begin_0, end = var_3490_end_0, end_mask = var_3490_end_mask_0, x = v_padded_15_cast_fp16)[name = string("op_3490_cast_fp16")];
+            tensor<int32, [4]> var_3492_begin_0 = const()[name = string("op_3492_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_3492_end_0 = const()[name = string("op_3492_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_3492_end_mask_0 = const()[name = string("op_3492_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3492_cast_fp16 = slice_by_index(begin = var_3492_begin_0, end = var_3492_end_0, end_mask = var_3492_end_mask_0, x = v_padded_15_cast_fp16)[name = string("op_3492_cast_fp16")];
+            int32 v_blocks_15_axis_0 = const()[name = string("v_blocks_15_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_15_cast_fp16 = stack(axis = v_blocks_15_axis_0, values = (var_3484_cast_fp16, var_3486_cast_fp16, var_3488_cast_fp16, var_3490_cast_fp16, var_3492_cast_fp16))[name = string("v_blocks_15_cast_fp16")];
+            tensor<int32, [5]> var_3500 = const()[name = string("op_3500"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_3502 = const()[name = string("op_3502"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_15_transpose_x_0 = const()[name = string("matrix_ac_15_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_15_transpose_y_0 = const()[name = string("matrix_ac_15_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_15_cast_fp16 = transpose(perm = var_3500, x = q_blocks_15_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_15_cast_fp16 = transpose(perm = var_3502, x = k_blocks_15_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_15_cast_fp16 = matmul(transpose_x = matrix_ac_15_transpose_x_0, transpose_y = matrix_ac_15_transpose_y_0, x = queries_15_cast_fp16, y = keys_t_15_cast_fp16)[name = string("matrix_ac_15_cast_fp16")];
+            tensor<int32, [4]> var_3505 = const()[name = string("op_3505"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_15_cast_fp16 = reshape(shape = var_3505, x = queries_15_cast_fp16)[name = string("q_flat_15_cast_fp16")];
+            bool matrix_bd_71_transpose_x_0 = const()[name = string("matrix_bd_71_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_71_transpose_y_0 = const()[name = string("matrix_bd_71_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_15_to_fp16 = const()[name = string("rel_k_t_15_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91234304)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_71_cast_fp16 = matmul(transpose_x = matrix_bd_71_transpose_x_0, transpose_y = matrix_bd_71_transpose_y_0, x = q_flat_15_cast_fp16, y = rel_k_t_15_to_fp16)[name = string("matrix_bd_71_cast_fp16")];
+            tensor<int32, [5]> var_3510 = const()[name = string("op_3510"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_329_cast_fp16 = reshape(shape = var_3510, x = matrix_bd_71_cast_fp16)[name = string("input_329_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_73_pad_0 = const()[name = string("matrix_bd_73_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91260992)))];
+            string matrix_bd_73_mode_0 = const()[name = string("matrix_bd_73_mode_0"), val = string("constant")];
+            fp16 const_114_to_fp16 = const()[name = string("const_114_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_73_cast_fp16 = pad(constant_val = const_114_to_fp16, mode = matrix_bd_73_mode_0, pad = matrix_bd_73_pad_0, x = input_329_cast_fp16)[name = string("matrix_bd_73_cast_fp16")];
+            tensor<int32, [4]> var_3514 = const()[name = string("op_3514"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_75_cast_fp16 = reshape(shape = var_3514, x = matrix_bd_73_cast_fp16)[name = string("matrix_bd_75_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_77_begin_0 = const()[name = string("matrix_bd_77_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_77_end_0 = const()[name = string("matrix_bd_77_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_77_end_mask_0 = const()[name = string("matrix_bd_77_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_77_cast_fp16 = slice_by_index(begin = matrix_bd_77_begin_0, end = matrix_bd_77_end_0, end_mask = matrix_bd_77_end_mask_0, x = matrix_bd_75_cast_fp16)[name = string("matrix_bd_77_cast_fp16")];
+            tensor<int32, [5]> var_3520 = const()[name = string("op_3520"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_79_cast_fp16 = reshape(shape = var_3520, x = matrix_bd_77_cast_fp16)[name = string("matrix_bd_79_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_43_cast_fp16 = add(x = matrix_ac_15_cast_fp16, y = matrix_bd_79_cast_fp16)[name = string("attn_43_cast_fp16")];
+            fp16 _inversed_3523_y_0_to_fp16 = const()[name = string("_inversed_3523_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_3523_cast_fp16 = mul(x = attn_43_cast_fp16, y = _inversed_3523_y_0_to_fp16)[name = string("_inversed_3523_cast_fp16")];
+            string _inversed_3523_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_3523_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_3523_cast_fp16_to_fp32 = cast(dtype = _inversed_3523_cast_fp16_to_fp32_dtype_0, x = _inversed_3523_cast_fp16)[name = string("cast_210")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_3524 = tanh(x = _inversed_3523_cast_fp16_to_fp32)[name = string("op_3524")];
+            string var_3524_to_fp16_dtype_0 = const()[name = string("op_3524_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_7_softcap_to_fp16 = const()[name = string("self_attns_7_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_3524_to_fp16 = cast(dtype = var_3524_to_fp16_dtype_0, x = var_3524)[name = string("cast_209")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_45_cast_fp16 = mul(x = var_3524_to_fp16, y = self_attns_7_softcap_to_fp16)[name = string("attn_45_cast_fp16")];
+            string attn_45_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_45_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_45_cast_fp16_to_fp32 = cast(dtype = attn_45_cast_fp16_to_fp32_dtype_0, x = attn_45_cast_fp16)[name = string("cast_208")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_331 = select(a = var_3387, b = attn_45_cast_fp16_to_fp32, cond = var_460)[name = string("input_331")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_3528 = softmax(axis = var_3386, x = input_331)[name = string("op_3528")];
+            tensor<int32, [5]> var_3530 = const()[name = string("op_3530"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_43_transpose_x_0 = const()[name = string("out_43_transpose_x_0"), val = bool(false)];
+            bool out_43_transpose_y_0 = const()[name = string("out_43_transpose_y_0"), val = bool(false)];
+            string var_3528_to_fp16_dtype_0 = const()[name = string("op_3528_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_15_cast_fp16 = transpose(perm = var_3530, x = v_blocks_15_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_3528_to_fp16 = cast(dtype = var_3528_to_fp16_dtype_0, x = var_3528)[name = string("cast_207")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_43_cast_fp16 = matmul(transpose_x = out_43_transpose_x_0, transpose_y = out_43_transpose_y_0, x = var_3528_to_fp16, y = values_t_15_cast_fp16)[name = string("out_43_cast_fp16")];
+            tensor<int32, [5]> var_3533 = const()[name = string("op_3533"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_3535 = const()[name = string("op_3535"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_3534_cast_fp16 = transpose(perm = var_3533, x = out_43_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 60, 1024]> out_45_cast_fp16 = reshape(shape = var_3535, x = var_3534_cast_fp16)[name = string("out_45_cast_fp16")];
+            tensor<int32, [3]> var_3538_begin_0 = const()[name = string("op_3538_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_3538_end_0 = const()[name = string("op_3538_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_3538_end_mask_0 = const()[name = string("op_3538_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_3538_cast_fp16 = slice_by_index(begin = var_3538_begin_0, end = var_3538_end_0, end_mask = var_3538_end_mask_0, x = out_45_cast_fp16)[name = string("op_3538_cast_fp16")];
+            fp16 self_attns_7_post_input_min_to_fp16 = const()[name = string("self_attns_7_post_input_min_to_fp16"), val = fp16(-0x1.12p+4)];
+            fp16 self_attns_7_post_input_max_to_fp16 = const()[name = string("self_attns_7_post_input_max_to_fp16"), val = fp16(0x1.1p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_209_cast_fp16 = clip(alpha = self_attns_7_post_input_min_to_fp16, beta = self_attns_7_post_input_max_to_fp16, x = var_3538_cast_fp16)[name = string("clip_209_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_7_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91261120))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91785472))))[name = string("self_attns_7_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_84_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_7_post_linear_weight_to_fp16_palettized, x = clip_209_cast_fp16)[name = string("linear_84_cast_fp16")];
+            fp16 self_attns_7_post_output_min_to_fp16 = const()[name = string("self_attns_7_post_output_min_to_fp16"), val = fp16(-0x1.c8p+5)];
+            fp16 self_attns_7_post_output_max_to_fp16 = const()[name = string("self_attns_7_post_output_max_to_fp16"), val = fp16(0x1.c4p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_210_cast_fp16 = clip(alpha = self_attns_7_post_output_min_to_fp16, beta = self_attns_7_post_output_max_to_fp16, x = linear_84_cast_fp16)[name = string("clip_210_cast_fp16")];
+            fp16 var_3550_to_fp16 = const()[name = string("op_3550_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3551_to_fp16 = const()[name = string("op_3551_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_211_cast_fp16 = clip(alpha = var_3550_to_fp16, beta = var_3551_to_fp16, x = clip_210_cast_fp16)[name = string("clip_211_cast_fp16")];
+            string clip_211_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_211_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3553 = const()[name = string("op_3553"), val = fp32(-0x1p-1)];
+            fp32 var_3557_promoted = const()[name = string("op_3557_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_211_cast_fp16_to_fp32 = cast(dtype = clip_211_cast_fp16_to_fp32_dtype_0, x = clip_211_cast_fp16)[name = string("cast_206")];
+            tensor<fp32, [1, 50, 1024]> var_3563 = pow(x = clip_211_cast_fp16_to_fp32, y = var_3557_promoted)[name = string("op_3563")];
+            tensor<int32, [1]> var_3565_axes_0 = const()[name = string("op_3565_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3565_keep_dims_0 = const()[name = string("op_3565_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3565 = reduce_mean(axes = var_3565_axes_0, keep_dims = var_3565_keep_dims_0, x = var_3563)[name = string("op_3565")];
+            string var_3565_to_fp16_dtype_0 = const()[name = string("op_3565_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3566_to_fp16 = const()[name = string("op_3566_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3565_to_fp16 = cast(dtype = var_3565_to_fp16_dtype_0, x = var_3565)[name = string("cast_205")];
+            tensor<fp16, [1, 50, 1]> mean_squared_133_cast_fp16 = add(x = var_3565_to_fp16, y = var_3566_to_fp16)[name = string("mean_squared_133_cast_fp16")];
+            string mean_squared_133_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_133_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_133_cast_fp16_to_fp32 = cast(dtype = mean_squared_133_cast_fp16_to_fp32_dtype_0, x = mean_squared_133_cast_fp16)[name = string("cast_204")];
+            tensor<fp32, [1, 50, 1]> var_3568 = pow(x = mean_squared_133_cast_fp16_to_fp32, y = var_3553)[name = string("op_3568")];
+            string var_3568_to_fp16_dtype_0 = const()[name = string("op_3568_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3568_to_fp16 = cast(dtype = var_3568_to_fp16_dtype_0, x = var_3568)[name = string("cast_203")];
+            tensor<fp16, [1, 50, 1024]> normed_output_265_cast_fp16 = mul(x = clip_211_cast_fp16, y = var_3568_to_fp16)[name = string("normed_output_265_cast_fp16")];
+            tensor<fp16, [1024]> const_115_to_fp16 = const()[name = string("const_115_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91786560)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_267_cast_fp16 = mul(x = normed_output_265_cast_fp16, y = const_115_to_fp16)[name = string("normed_output_267_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_775_cast_fp16 = add(x = normed_output_267_cast_fp16, y = hidden_states_749_cast_fp16)[name = string("hidden_states_775_cast_fp16")];
+            string hidden_states_775_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_775_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3575 = const()[name = string("op_3575"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_3576 = const()[name = string("op_3576"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_3588 = const()[name = string("op_3588"), val = fp32(-0x1p-1)];
+            fp32 var_3584_promoted = const()[name = string("op_3584_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_775_cast_fp16_to_fp32 = cast(dtype = hidden_states_775_cast_fp16_to_fp32_dtype_0, x = hidden_states_775_cast_fp16)[name = string("cast_202")];
+            tensor<fp32, [1, 50, 1024]> var_3596 = pow(x = hidden_states_775_cast_fp16_to_fp32, y = var_3584_promoted)[name = string("op_3596")];
+            tensor<int32, [1]> var_3598_axes_0 = const()[name = string("op_3598_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3598_keep_dims_0 = const()[name = string("op_3598_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3598 = reduce_mean(axes = var_3598_axes_0, keep_dims = var_3598_keep_dims_0, x = var_3596)[name = string("op_3598")];
+            string var_3598_to_fp16_dtype_0 = const()[name = string("op_3598_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3599_to_fp16 = const()[name = string("op_3599_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3598_to_fp16 = cast(dtype = var_3598_to_fp16_dtype_0, x = var_3598)[name = string("cast_201")];
+            tensor<fp16, [1, 50, 1]> mean_squared_135_cast_fp16 = add(x = var_3598_to_fp16, y = var_3599_to_fp16)[name = string("mean_squared_135_cast_fp16")];
+            string mean_squared_135_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_135_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_135_cast_fp16_to_fp32 = cast(dtype = mean_squared_135_cast_fp16_to_fp32_dtype_0, x = mean_squared_135_cast_fp16)[name = string("cast_200")];
+            tensor<fp32, [1, 50, 1]> var_3601 = pow(x = mean_squared_135_cast_fp16_to_fp32, y = var_3588)[name = string("op_3601")];
+            string var_3601_to_fp16_dtype_0 = const()[name = string("op_3601_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3601_to_fp16 = cast(dtype = var_3601_to_fp16_dtype_0, x = var_3601)[name = string("cast_199")];
+            tensor<fp16, [1, 50, 1024]> normed_output_269_cast_fp16 = mul(x = hidden_states_775_cast_fp16, y = var_3601_to_fp16)[name = string("normed_output_269_cast_fp16")];
+            tensor<fp16, [1024]> const_116_to_fp16 = const()[name = string("const_116_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91788672)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_271_cast_fp16 = mul(x = normed_output_269_cast_fp16, y = const_116_to_fp16)[name = string("normed_output_271_cast_fp16")];
+            fp16 lconv1ds_7_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_7_linear_start_input_min_to_fp16"), val = fp16(-0x1.58p+3)];
+            fp16 lconv1ds_7_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_7_linear_start_input_max_to_fp16"), val = fp16(0x1.54p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_212_cast_fp16 = clip(alpha = lconv1ds_7_linear_start_input_min_to_fp16, beta = lconv1ds_7_linear_start_input_max_to_fp16, x = normed_output_271_cast_fp16)[name = string("clip_212_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_7_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91790784))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92839424))))[name = string("lconv1ds_7_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_85_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_7_linear_start_linear_weight_to_fp16_palettized, x = clip_212_cast_fp16)[name = string("linear_85_cast_fp16")];
+            fp16 lconv1ds_7_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_7_linear_start_output_min_to_fp16"), val = fp16(-0x1.74p+4)];
+            fp16 lconv1ds_7_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_7_linear_start_output_max_to_fp16"), val = fp16(0x1.72p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_213_cast_fp16 = clip(alpha = lconv1ds_7_linear_start_output_min_to_fp16, beta = lconv1ds_7_linear_start_output_max_to_fp16, x = linear_85_cast_fp16)[name = string("clip_213_cast_fp16")];
+            int32 hidden_states_783_split_num_splits_0 = const()[name = string("hidden_states_783_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_783_split_axis_0 = const()[name = string("hidden_states_783_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_783_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_783_split_cast_fp16_1 = split(axis = hidden_states_783_split_axis_0, num_splits = hidden_states_783_split_num_splits_0, x = clip_213_cast_fp16)[name = string("hidden_states_783_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_783_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_783_split_cast_fp16_1)[name = string("hidden_states_783_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_783_cast_fp16 = mul(x = hidden_states_783_split_cast_fp16_0, y = hidden_states_783_split_1_sigmoid_cast_fp16)[name = string("hidden_states_783_cast_fp16")];
+            tensor<int32, [3]> input_339_perm_0 = const()[name = string("input_339_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_341_pad_0 = const()[name = string("input_341_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_341_mode_0 = const()[name = string("input_341_mode_0"), val = string("constant")];
+            fp16 const_117_to_fp16 = const()[name = string("const_117_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_339_cast_fp16 = transpose(perm = input_339_perm_0, x = hidden_states_783_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 1024, 54]> input_341_cast_fp16 = pad(constant_val = const_117_to_fp16, mode = input_341_mode_0, pad = input_341_pad_0, x = input_339_cast_fp16)[name = string("input_341_cast_fp16")];
+            string var_3627_pad_type_0 = const()[name = string("op_3627_pad_type_0"), val = string("valid")];
+            int32 var_3627_groups_0 = const()[name = string("op_3627_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_3627_strides_0 = const()[name = string("op_3627_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3627_pad_0 = const()[name = string("op_3627_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3627_dilations_0 = const()[name = string("op_3627_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_7_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92841536))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92844160))))[name = string("lconv1ds_7_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_3627_cast_fp16 = conv(dilations = var_3627_dilations_0, groups = var_3627_groups_0, pad = var_3627_pad_0, pad_type = var_3627_pad_type_0, strides = var_3627_strides_0, weight = lconv1ds_7_depthwise_conv1d_weight_to_fp16_palettized, x = input_341_cast_fp16)[name = string("op_3627_cast_fp16")];
+            tensor<int32, [3]> hidden_states_785_perm_0 = const()[name = string("hidden_states_785_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_785_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_785_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_785_cast_fp16 = transpose(perm = hidden_states_785_perm_0, x = var_3627_cast_fp16)[name = string("transpose_24")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_785_cast_fp16_to_fp32 = cast(dtype = hidden_states_785_cast_fp16_to_fp32_dtype_0, x = hidden_states_785_cast_fp16)[name = string("cast_198")];
+            tensor<fp32, [1, 50, 1024]> clip_214 = clip(alpha = var_3576, beta = var_3575, x = hidden_states_785_cast_fp16_to_fp32)[name = string("clip_214")];
+            fp32 var_3584_promoted_1 = const()[name = string("op_3584_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3632 = pow(x = clip_214, y = var_3584_promoted_1)[name = string("op_3632")];
+            tensor<int32, [1]> var_3634_axes_0 = const()[name = string("op_3634_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3634_keep_dims_0 = const()[name = string("op_3634_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3634 = reduce_mean(axes = var_3634_axes_0, keep_dims = var_3634_keep_dims_0, x = var_3632)[name = string("op_3634")];
+            string var_3634_to_fp16_dtype_0 = const()[name = string("op_3634_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3635_to_fp16 = const()[name = string("op_3635_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3634_to_fp16 = cast(dtype = var_3634_to_fp16_dtype_0, x = var_3634)[name = string("cast_197")];
+            tensor<fp16, [1, 50, 1]> mean_squared_137_cast_fp16 = add(x = var_3634_to_fp16, y = var_3635_to_fp16)[name = string("mean_squared_137_cast_fp16")];
+            string mean_squared_137_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_137_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_137_cast_fp16_to_fp32 = cast(dtype = mean_squared_137_cast_fp16_to_fp32_dtype_0, x = mean_squared_137_cast_fp16)[name = string("cast_196")];
+            tensor<fp32, [1, 50, 1]> var_3637 = pow(x = mean_squared_137_cast_fp16_to_fp32, y = var_3588)[name = string("op_3637")];
+            string clip_214_to_fp16_dtype_0 = const()[name = string("clip_214_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3637_to_fp16_dtype_0 = const()[name = string("op_3637_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_214_to_fp16 = cast(dtype = clip_214_to_fp16_dtype_0, x = clip_214)[name = string("cast_194")];
+            tensor<fp16, [1, 50, 1]> var_3637_to_fp16 = cast(dtype = var_3637_to_fp16_dtype_0, x = var_3637)[name = string("cast_195")];
+            tensor<fp16, [1, 50, 1024]> normed_output_273_cast_fp16 = mul(x = clip_214_to_fp16, y = var_3637_to_fp16)[name = string("normed_output_273_cast_fp16")];
+            tensor<fp16, [1024]> const_118_to_fp16 = const()[name = string("const_118_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92845248)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_275_cast_fp16 = mul(x = normed_output_273_cast_fp16, y = const_118_to_fp16)[name = string("normed_output_275_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_791_cast_fp16 = silu(x = normed_output_275_cast_fp16)[name = string("hidden_states_791_cast_fp16")];
+            fp16 lconv1ds_7_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_7_linear_end_input_min_to_fp16"), val = fp16(-0x1p+3)];
+            fp16 lconv1ds_7_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_7_linear_end_input_max_to_fp16"), val = fp16(0x1.fcp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_215_cast_fp16 = clip(alpha = lconv1ds_7_linear_end_input_min_to_fp16, beta = lconv1ds_7_linear_end_input_max_to_fp16, x = hidden_states_791_cast_fp16)[name = string("clip_215_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_7_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92847360))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93371712))))[name = string("lconv1ds_7_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_86_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_7_linear_end_linear_weight_to_fp16_palettized, x = clip_215_cast_fp16)[name = string("linear_86_cast_fp16")];
+            fp16 lconv1ds_7_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_7_linear_end_output_min_to_fp16"), val = fp16(-0x1.cap+2)];
+            fp16 lconv1ds_7_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_7_linear_end_output_max_to_fp16"), val = fp16(0x1.c8p+2)];
+            tensor<fp16, [1, 50, 1024]> clip_216_cast_fp16 = clip(alpha = lconv1ds_7_linear_end_output_min_to_fp16, beta = lconv1ds_7_linear_end_output_max_to_fp16, x = linear_86_cast_fp16)[name = string("clip_216_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_797_cast_fp16 = add(x = clip_216_cast_fp16, y = hidden_states_775_cast_fp16)[name = string("hidden_states_797_cast_fp16")];
+            string hidden_states_797_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_797_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3661 = const()[name = string("op_3661"), val = fp32(-0x1p-1)];
+            fp32 var_3662 = const()[name = string("op_3662"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_3663 = const()[name = string("op_3663"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_797_cast_fp16_to_fp32 = cast(dtype = hidden_states_797_cast_fp16_to_fp32_dtype_0, x = hidden_states_797_cast_fp16)[name = string("cast_193")];
+            tensor<fp32, [1, 50, 1024]> clip_217 = clip(alpha = var_3663, beta = var_3662, x = hidden_states_797_cast_fp16_to_fp32)[name = string("clip_217")];
+            fp32 var_3657_promoted = const()[name = string("op_3657_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3671 = pow(x = clip_217, y = var_3657_promoted)[name = string("op_3671")];
+            tensor<int32, [1]> var_3673_axes_0 = const()[name = string("op_3673_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3673_keep_dims_0 = const()[name = string("op_3673_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3673 = reduce_mean(axes = var_3673_axes_0, keep_dims = var_3673_keep_dims_0, x = var_3671)[name = string("op_3673")];
+            string var_3673_to_fp16_dtype_0 = const()[name = string("op_3673_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3674_to_fp16 = const()[name = string("op_3674_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3673_to_fp16 = cast(dtype = var_3673_to_fp16_dtype_0, x = var_3673)[name = string("cast_192")];
+            tensor<fp16, [1, 50, 1]> mean_squared_139_cast_fp16 = add(x = var_3673_to_fp16, y = var_3674_to_fp16)[name = string("mean_squared_139_cast_fp16")];
+            string mean_squared_139_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_139_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_139_cast_fp16_to_fp32 = cast(dtype = mean_squared_139_cast_fp16_to_fp32_dtype_0, x = mean_squared_139_cast_fp16)[name = string("cast_191")];
+            tensor<fp32, [1, 50, 1]> var_3676 = pow(x = mean_squared_139_cast_fp16_to_fp32, y = var_3661)[name = string("op_3676")];
+            string clip_217_to_fp16_dtype_0 = const()[name = string("clip_217_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3676_to_fp16_dtype_0 = const()[name = string("op_3676_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_217_to_fp16 = cast(dtype = clip_217_to_fp16_dtype_0, x = clip_217)[name = string("cast_189")];
+            tensor<fp16, [1, 50, 1]> var_3676_to_fp16 = cast(dtype = var_3676_to_fp16_dtype_0, x = var_3676)[name = string("cast_190")];
+            tensor<fp16, [1, 50, 1024]> normed_output_277_cast_fp16 = mul(x = clip_217_to_fp16, y = var_3676_to_fp16)[name = string("normed_output_277_cast_fp16")];
+            tensor<fp16, [1024]> const_119_to_fp16 = const()[name = string("const_119_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93372800)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_279_cast_fp16 = mul(x = normed_output_277_cast_fp16, y = const_119_to_fp16)[name = string("normed_output_279_cast_fp16")];
+            fp16 feed_forward2s_7_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.a8p+3)];
+            fp16 feed_forward2s_7_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.a4p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_218_cast_fp16 = clip(alpha = feed_forward2s_7_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_7_ffw_layer_1_input_max_to_fp16, x = normed_output_279_cast_fp16)[name = string("clip_218_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_7_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(93374912))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95472128))))[name = string("feed_forward2s_7_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_87_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_7_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_218_cast_fp16)[name = string("linear_87_cast_fp16")];
+            fp16 feed_forward2s_7_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.d6p+4)];
+            fp16 feed_forward2s_7_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.d2p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_219_cast_fp16 = clip(alpha = feed_forward2s_7_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_7_ffw_layer_1_output_max_to_fp16, x = linear_87_cast_fp16)[name = string("clip_219_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_807_cast_fp16 = silu(x = clip_219_cast_fp16)[name = string("hidden_states_807_cast_fp16")];
+            fp16 feed_forward2s_7_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.4p+3)];
+            fp16 feed_forward2s_7_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.3ep+3)];
+            tensor<fp16, [1, 50, 4096]> clip_220_cast_fp16 = clip(alpha = feed_forward2s_7_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_7_ffw_layer_2_input_max_to_fp16, x = hidden_states_807_cast_fp16)[name = string("clip_220_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_7_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95476288))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97573504))))[name = string("feed_forward2s_7_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_88_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_7_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_220_cast_fp16)[name = string("linear_88_cast_fp16")];
+            fp16 feed_forward2s_7_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.bp+5)];
+            fp16 feed_forward2s_7_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_7_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.aep+5)];
+            tensor<fp16, [1, 50, 1024]> clip_221_cast_fp16 = clip(alpha = feed_forward2s_7_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_7_ffw_layer_2_output_max_to_fp16, x = linear_88_cast_fp16)[name = string("clip_221_cast_fp16")];
+            string clip_221_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_221_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_221_cast_fp16_to_fp32 = cast(dtype = clip_221_cast_fp16_to_fp32_dtype_0, x = clip_221_cast_fp16)[name = string("cast_188")];
+            tensor<fp32, [1, 50, 1024]> clip_222 = clip(alpha = var_3663, beta = var_3662, x = clip_221_cast_fp16_to_fp32)[name = string("clip_222")];
+            fp32 var_3657_promoted_1 = const()[name = string("op_3657_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3703 = pow(x = clip_222, y = var_3657_promoted_1)[name = string("op_3703")];
+            tensor<int32, [1]> var_3705_axes_0 = const()[name = string("op_3705_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3705_keep_dims_0 = const()[name = string("op_3705_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3705 = reduce_mean(axes = var_3705_axes_0, keep_dims = var_3705_keep_dims_0, x = var_3703)[name = string("op_3705")];
+            string var_3705_to_fp16_dtype_0 = const()[name = string("op_3705_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3706_to_fp16 = const()[name = string("op_3706_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3705_to_fp16 = cast(dtype = var_3705_to_fp16_dtype_0, x = var_3705)[name = string("cast_187")];
+            tensor<fp16, [1, 50, 1]> mean_squared_141_cast_fp16 = add(x = var_3705_to_fp16, y = var_3706_to_fp16)[name = string("mean_squared_141_cast_fp16")];
+            string mean_squared_141_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_141_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_141_cast_fp16_to_fp32 = cast(dtype = mean_squared_141_cast_fp16_to_fp32_dtype_0, x = mean_squared_141_cast_fp16)[name = string("cast_186")];
+            tensor<fp32, [1, 50, 1]> var_3708 = pow(x = mean_squared_141_cast_fp16_to_fp32, y = var_3661)[name = string("op_3708")];
+            string clip_222_to_fp16_dtype_0 = const()[name = string("clip_222_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3708_to_fp16_dtype_0 = const()[name = string("op_3708_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_222_to_fp16 = cast(dtype = clip_222_to_fp16_dtype_0, x = clip_222)[name = string("cast_184")];
+            tensor<fp16, [1, 50, 1]> var_3708_to_fp16 = cast(dtype = var_3708_to_fp16_dtype_0, x = var_3708)[name = string("cast_185")];
+            tensor<fp16, [1, 50, 1024]> normed_output_281_cast_fp16 = mul(x = clip_222_to_fp16, y = var_3708_to_fp16)[name = string("normed_output_281_cast_fp16")];
+            tensor<fp16, [1024]> const_120_to_fp16 = const()[name = string("const_120_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97574592)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_283_cast_fp16 = mul(x = normed_output_281_cast_fp16, y = const_120_to_fp16)[name = string("normed_output_283_cast_fp16")];
+            fp16 var_3653_to_fp16 = const()[name = string("op_3653_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_819_cast_fp16 = mul(x = normed_output_283_cast_fp16, y = var_3653_to_fp16)[name = string("hidden_states_819_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_821_cast_fp16 = add(x = hidden_states_819_cast_fp16, y = hidden_states_797_cast_fp16)[name = string("hidden_states_821_cast_fp16")];
+            fp16 var_3715_to_fp16 = const()[name = string("op_3715_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3716_to_fp16 = const()[name = string("op_3716_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_223_cast_fp16 = clip(alpha = var_3715_to_fp16, beta = var_3716_to_fp16, x = hidden_states_821_cast_fp16)[name = string("clip_223_cast_fp16")];
+            string clip_223_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_223_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3718 = const()[name = string("op_3718"), val = fp32(-0x1p-1)];
+            fp32 var_3722_promoted = const()[name = string("op_3722_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_223_cast_fp16_to_fp32 = cast(dtype = clip_223_cast_fp16_to_fp32_dtype_0, x = clip_223_cast_fp16)[name = string("cast_183")];
+            tensor<fp32, [1, 50, 1024]> var_3728 = pow(x = clip_223_cast_fp16_to_fp32, y = var_3722_promoted)[name = string("op_3728")];
+            tensor<int32, [1]> var_3730_axes_0 = const()[name = string("op_3730_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3730_keep_dims_0 = const()[name = string("op_3730_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3730 = reduce_mean(axes = var_3730_axes_0, keep_dims = var_3730_keep_dims_0, x = var_3728)[name = string("op_3730")];
+            string var_3730_to_fp16_dtype_0 = const()[name = string("op_3730_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3731_to_fp16 = const()[name = string("op_3731_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3730_to_fp16 = cast(dtype = var_3730_to_fp16_dtype_0, x = var_3730)[name = string("cast_182")];
+            tensor<fp16, [1, 50, 1]> mean_squared_143_cast_fp16 = add(x = var_3730_to_fp16, y = var_3731_to_fp16)[name = string("mean_squared_143_cast_fp16")];
+            string mean_squared_143_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_143_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_143_cast_fp16_to_fp32 = cast(dtype = mean_squared_143_cast_fp16_to_fp32_dtype_0, x = mean_squared_143_cast_fp16)[name = string("cast_181")];
+            tensor<fp32, [1, 50, 1]> var_3733 = pow(x = mean_squared_143_cast_fp16_to_fp32, y = var_3718)[name = string("op_3733")];
+            string var_3733_to_fp16_dtype_0 = const()[name = string("op_3733_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3733_to_fp16 = cast(dtype = var_3733_to_fp16_dtype_0, x = var_3733)[name = string("cast_180")];
+            tensor<fp16, [1, 50, 1024]> normed_output_285_cast_fp16 = mul(x = clip_223_cast_fp16, y = var_3733_to_fp16)[name = string("normed_output_285_cast_fp16")];
+            tensor<fp16, [1024]> const_121_to_fp16 = const()[name = string("const_121_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97576704)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_287_cast_fp16 = mul(x = normed_output_285_cast_fp16, y = const_121_to_fp16)[name = string("normed_output_287_cast_fp16")];
+            string normed_output_287_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_287_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3746 = const()[name = string("op_3746"), val = fp32(-0x1p-1)];
+            fp32 var_3747 = const()[name = string("op_3747"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_3748 = const()[name = string("op_3748"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_287_cast_fp16_to_fp32 = cast(dtype = normed_output_287_cast_fp16_to_fp32_dtype_0, x = normed_output_287_cast_fp16)[name = string("cast_179")];
+            tensor<fp32, [1, 50, 1024]> clip_224 = clip(alpha = var_3748, beta = var_3747, x = normed_output_287_cast_fp16_to_fp32)[name = string("clip_224")];
+            fp32 var_3742_promoted = const()[name = string("op_3742_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3756 = pow(x = clip_224, y = var_3742_promoted)[name = string("op_3756")];
+            tensor<int32, [1]> var_3758_axes_0 = const()[name = string("op_3758_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3758_keep_dims_0 = const()[name = string("op_3758_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3758 = reduce_mean(axes = var_3758_axes_0, keep_dims = var_3758_keep_dims_0, x = var_3756)[name = string("op_3758")];
+            string var_3758_to_fp16_dtype_0 = const()[name = string("op_3758_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3759_to_fp16 = const()[name = string("op_3759_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3758_to_fp16 = cast(dtype = var_3758_to_fp16_dtype_0, x = var_3758)[name = string("cast_178")];
+            tensor<fp16, [1, 50, 1]> mean_squared_145_cast_fp16 = add(x = var_3758_to_fp16, y = var_3759_to_fp16)[name = string("mean_squared_145_cast_fp16")];
+            string mean_squared_145_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_145_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_145_cast_fp16_to_fp32 = cast(dtype = mean_squared_145_cast_fp16_to_fp32_dtype_0, x = mean_squared_145_cast_fp16)[name = string("cast_177")];
+            tensor<fp32, [1, 50, 1]> var_3761 = pow(x = mean_squared_145_cast_fp16_to_fp32, y = var_3746)[name = string("op_3761")];
+            string clip_224_to_fp16_dtype_0 = const()[name = string("clip_224_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3761_to_fp16_dtype_0 = const()[name = string("op_3761_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_224_to_fp16 = cast(dtype = clip_224_to_fp16_dtype_0, x = clip_224)[name = string("cast_175")];
+            tensor<fp16, [1, 50, 1]> var_3761_to_fp16 = cast(dtype = var_3761_to_fp16_dtype_0, x = var_3761)[name = string("cast_176")];
+            tensor<fp16, [1, 50, 1024]> normed_output_289_cast_fp16 = mul(x = clip_224_to_fp16, y = var_3761_to_fp16)[name = string("normed_output_289_cast_fp16")];
+            tensor<fp16, [1024]> const_122_to_fp16 = const()[name = string("const_122_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97578816)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_291_cast_fp16 = mul(x = normed_output_289_cast_fp16, y = const_122_to_fp16)[name = string("normed_output_291_cast_fp16")];
+            fp16 feed_forward1s_8_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.66p+3)];
+            fp16 feed_forward1s_8_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.64p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_225_cast_fp16 = clip(alpha = feed_forward1s_8_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_8_ffw_layer_1_input_max_to_fp16, x = normed_output_291_cast_fp16)[name = string("clip_225_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_8_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97580928))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99678144))))[name = string("feed_forward1s_8_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_89_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_8_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_225_cast_fp16)[name = string("linear_89_cast_fp16")];
+            fp16 feed_forward1s_8_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.6cp+4)];
+            fp16 feed_forward1s_8_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.6ap+4)];
+            tensor<fp16, [1, 50, 4096]> clip_226_cast_fp16 = clip(alpha = feed_forward1s_8_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_8_ffw_layer_1_output_max_to_fp16, x = linear_89_cast_fp16)[name = string("clip_226_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_837_cast_fp16 = silu(x = clip_226_cast_fp16)[name = string("hidden_states_837_cast_fp16")];
+            fp16 feed_forward1s_8_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.1ap+3)];
+            fp16 feed_forward1s_8_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.18p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_227_cast_fp16 = clip(alpha = feed_forward1s_8_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_8_ffw_layer_2_input_max_to_fp16, x = hidden_states_837_cast_fp16)[name = string("clip_227_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_8_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(99682304))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101779520))))[name = string("feed_forward1s_8_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_90_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_8_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_227_cast_fp16)[name = string("linear_90_cast_fp16")];
+            fp16 feed_forward1s_8_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.36p+5)];
+            fp16 feed_forward1s_8_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_8_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.34p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_228_cast_fp16 = clip(alpha = feed_forward1s_8_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_8_ffw_layer_2_output_max_to_fp16, x = linear_90_cast_fp16)[name = string("clip_228_cast_fp16")];
+            string clip_228_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_228_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_228_cast_fp16_to_fp32 = cast(dtype = clip_228_cast_fp16_to_fp32_dtype_0, x = clip_228_cast_fp16)[name = string("cast_174")];
+            tensor<fp32, [1, 50, 1024]> clip_229 = clip(alpha = var_3748, beta = var_3747, x = clip_228_cast_fp16_to_fp32)[name = string("clip_229")];
+            fp32 var_3742_promoted_1 = const()[name = string("op_3742_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_3788 = pow(x = clip_229, y = var_3742_promoted_1)[name = string("op_3788")];
+            tensor<int32, [1]> var_3790_axes_0 = const()[name = string("op_3790_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3790_keep_dims_0 = const()[name = string("op_3790_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3790 = reduce_mean(axes = var_3790_axes_0, keep_dims = var_3790_keep_dims_0, x = var_3788)[name = string("op_3790")];
+            string var_3790_to_fp16_dtype_0 = const()[name = string("op_3790_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3791_to_fp16 = const()[name = string("op_3791_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3790_to_fp16 = cast(dtype = var_3790_to_fp16_dtype_0, x = var_3790)[name = string("cast_173")];
+            tensor<fp16, [1, 50, 1]> mean_squared_147_cast_fp16 = add(x = var_3790_to_fp16, y = var_3791_to_fp16)[name = string("mean_squared_147_cast_fp16")];
+            string mean_squared_147_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_147_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_147_cast_fp16_to_fp32 = cast(dtype = mean_squared_147_cast_fp16_to_fp32_dtype_0, x = mean_squared_147_cast_fp16)[name = string("cast_172")];
+            tensor<fp32, [1, 50, 1]> var_3793 = pow(x = mean_squared_147_cast_fp16_to_fp32, y = var_3746)[name = string("op_3793")];
+            string clip_229_to_fp16_dtype_0 = const()[name = string("clip_229_to_fp16_dtype_0"), val = string("fp16")];
+            string var_3793_to_fp16_dtype_0 = const()[name = string("op_3793_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_229_to_fp16 = cast(dtype = clip_229_to_fp16_dtype_0, x = clip_229)[name = string("cast_170")];
+            tensor<fp16, [1, 50, 1]> var_3793_to_fp16 = cast(dtype = var_3793_to_fp16_dtype_0, x = var_3793)[name = string("cast_171")];
+            tensor<fp16, [1, 50, 1024]> normed_output_293_cast_fp16 = mul(x = clip_229_to_fp16, y = var_3793_to_fp16)[name = string("normed_output_293_cast_fp16")];
+            tensor<fp16, [1024]> const_123_to_fp16 = const()[name = string("const_123_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101780608)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_295_cast_fp16 = mul(x = normed_output_293_cast_fp16, y = const_123_to_fp16)[name = string("normed_output_295_cast_fp16")];
+            fp16 var_3738_to_fp16 = const()[name = string("op_3738_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_849_cast_fp16 = mul(x = normed_output_295_cast_fp16, y = var_3738_to_fp16)[name = string("hidden_states_849_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_851_cast_fp16 = add(x = hidden_states_849_cast_fp16, y = normed_output_287_cast_fp16)[name = string("hidden_states_851_cast_fp16")];
+            fp16 var_3800_to_fp16 = const()[name = string("op_3800_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3801_to_fp16 = const()[name = string("op_3801_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_230_cast_fp16 = clip(alpha = var_3800_to_fp16, beta = var_3801_to_fp16, x = hidden_states_851_cast_fp16)[name = string("clip_230_cast_fp16")];
+            string clip_230_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_230_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3803 = const()[name = string("op_3803"), val = fp32(-0x1p-1)];
+            fp32 var_3807_promoted = const()[name = string("op_3807_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_230_cast_fp16_to_fp32 = cast(dtype = clip_230_cast_fp16_to_fp32_dtype_0, x = clip_230_cast_fp16)[name = string("cast_169")];
+            tensor<fp32, [1, 50, 1024]> var_3813 = pow(x = clip_230_cast_fp16_to_fp32, y = var_3807_promoted)[name = string("op_3813")];
+            tensor<int32, [1]> var_3815_axes_0 = const()[name = string("op_3815_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3815_keep_dims_0 = const()[name = string("op_3815_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_3815 = reduce_mean(axes = var_3815_axes_0, keep_dims = var_3815_keep_dims_0, x = var_3813)[name = string("op_3815")];
+            string var_3815_to_fp16_dtype_0 = const()[name = string("op_3815_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_3816_to_fp16 = const()[name = string("op_3816_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_3815_to_fp16 = cast(dtype = var_3815_to_fp16_dtype_0, x = var_3815)[name = string("cast_168")];
+            tensor<fp16, [1, 50, 1]> mean_squared_149_cast_fp16 = add(x = var_3815_to_fp16, y = var_3816_to_fp16)[name = string("mean_squared_149_cast_fp16")];
+            string mean_squared_149_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_149_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_149_cast_fp16_to_fp32 = cast(dtype = mean_squared_149_cast_fp16_to_fp32_dtype_0, x = mean_squared_149_cast_fp16)[name = string("cast_167")];
+            tensor<fp32, [1, 50, 1]> var_3818 = pow(x = mean_squared_149_cast_fp16_to_fp32, y = var_3803)[name = string("op_3818")];
+            string var_3818_to_fp16_dtype_0 = const()[name = string("op_3818_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_3818_to_fp16 = cast(dtype = var_3818_to_fp16_dtype_0, x = var_3818)[name = string("cast_166")];
+            tensor<fp16, [1, 50, 1024]> normed_output_297_cast_fp16 = mul(x = clip_230_cast_fp16, y = var_3818_to_fp16)[name = string("normed_output_297_cast_fp16")];
+            tensor<fp16, [1024]> const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101782720)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_299_cast_fp16 = mul(x = normed_output_297_cast_fp16, y = const_124_to_fp16)[name = string("normed_output_299_cast_fp16")];
+            int32 var_3824 = const()[name = string("op_3824"), val = int32(-1)];
+            fp32 var_3825 = const()[name = string("op_3825"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_8_q_proj_input_min_to_fp16 = const()[name = string("self_attns_8_q_proj_input_min_to_fp16"), val = fp16(-0x1.22p+3)];
+            fp16 self_attns_8_q_proj_input_max_to_fp16 = const()[name = string("self_attns_8_q_proj_input_max_to_fp16"), val = fp16(0x1.2p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_231_cast_fp16 = clip(alpha = self_attns_8_q_proj_input_min_to_fp16, beta = self_attns_8_q_proj_input_max_to_fp16, x = normed_output_299_cast_fp16)[name = string("clip_231_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_8_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101784832))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102309184))))[name = string("self_attns_8_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_91_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_8_q_proj_linear_weight_to_fp16_palettized, x = clip_231_cast_fp16)[name = string("linear_91_cast_fp16")];
+            fp16 self_attns_8_q_proj_output_min_to_fp16 = const()[name = string("self_attns_8_q_proj_output_min_to_fp16"), val = fp16(-0x1.c2p+3)];
+            fp16 self_attns_8_q_proj_output_max_to_fp16 = const()[name = string("self_attns_8_q_proj_output_max_to_fp16"), val = fp16(0x1.bep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_232_cast_fp16 = clip(alpha = self_attns_8_q_proj_output_min_to_fp16, beta = self_attns_8_q_proj_output_max_to_fp16, x = linear_91_cast_fp16)[name = string("clip_232_cast_fp16")];
+            tensor<int32, [4]> var_3869 = const()[name = string("op_3869"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_17_cast_fp16 = reshape(shape = var_3869, x = clip_232_cast_fp16)[name = string("q_17_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_8_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102310272))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102834624))))[name = string("self_attns_8_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_92_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_8_k_proj_linear_weight_to_fp16_palettized, x = clip_231_cast_fp16)[name = string("linear_92_cast_fp16")];
+            fp16 self_attns_8_k_proj_output_min_to_fp16 = const()[name = string("self_attns_8_k_proj_output_min_to_fp16"), val = fp16(-0x1.c2p+3)];
+            fp16 self_attns_8_k_proj_output_max_to_fp16 = const()[name = string("self_attns_8_k_proj_output_max_to_fp16"), val = fp16(0x1.bep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_234_cast_fp16 = clip(alpha = self_attns_8_k_proj_output_min_to_fp16, beta = self_attns_8_k_proj_output_max_to_fp16, x = linear_92_cast_fp16)[name = string("clip_234_cast_fp16")];
+            tensor<int32, [4]> var_3881 = const()[name = string("op_3881"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_17_cast_fp16 = reshape(shape = var_3881, x = clip_234_cast_fp16)[name = string("k_17_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_8_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102835712))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103360064))))[name = string("self_attns_8_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_93_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_8_v_proj_linear_weight_to_fp16_palettized, x = clip_231_cast_fp16)[name = string("linear_93_cast_fp16")];
+            fp16 self_attns_8_v_proj_output_min_to_fp16 = const()[name = string("self_attns_8_v_proj_output_min_to_fp16"), val = fp16(-0x1.c2p+3)];
+            fp16 self_attns_8_v_proj_output_max_to_fp16 = const()[name = string("self_attns_8_v_proj_output_max_to_fp16"), val = fp16(0x1.bep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_236_cast_fp16 = clip(alpha = self_attns_8_v_proj_output_min_to_fp16, beta = self_attns_8_v_proj_output_max_to_fp16, x = linear_93_cast_fp16)[name = string("clip_236_cast_fp16")];
+            tensor<int32, [4]> var_3893 = const()[name = string("op_3893"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_369_cast_fp16 = reshape(shape = var_3893, x = clip_236_cast_fp16)[name = string("input_369_cast_fp16")];
+            fp16 var_3895_to_fp16 = const()[name = string("op_3895_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_3896_cast_fp16 = mul(x = q_17_cast_fp16, y = var_3895_to_fp16)[name = string("op_3896_cast_fp16")];
+            tensor<fp16, [128]> var_3897_to_fp16 = const()[name = string("op_3897_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103361152)))];
+            tensor<fp16, [1, 50, 8, 128]> input_365_cast_fp16 = mul(x = var_3896_cast_fp16, y = var_3897_to_fp16)[name = string("input_365_cast_fp16")];
+            fp16 var_3899_to_fp16 = const()[name = string("op_3899_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_367_cast_fp16 = mul(x = k_17_cast_fp16, y = var_3899_to_fp16)[name = string("input_367_cast_fp16")];
+            tensor<int32, [8]> q_padded_17_pad_0 = const()[name = string("q_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_17_mode_0 = const()[name = string("q_padded_17_mode_0"), val = string("constant")];
+            fp16 const_125_to_fp16 = const()[name = string("const_125_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_17_cast_fp16 = pad(constant_val = const_125_to_fp16, mode = q_padded_17_mode_0, pad = q_padded_17_pad_0, x = input_365_cast_fp16)[name = string("q_padded_17_cast_fp16")];
+            tensor<int32, [5]> var_3903 = const()[name = string("op_3903"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_17_cast_fp16 = reshape(shape = var_3903, x = q_padded_17_cast_fp16)[name = string("q_blocks_17_cast_fp16")];
+            tensor<int32, [8]> k_padded_17_pad_0 = const()[name = string("k_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_17_mode_0 = const()[name = string("k_padded_17_mode_0"), val = string("constant")];
+            fp16 const_126_to_fp16 = const()[name = string("const_126_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_17_cast_fp16 = pad(constant_val = const_126_to_fp16, mode = k_padded_17_mode_0, pad = k_padded_17_pad_0, x = input_367_cast_fp16)[name = string("k_padded_17_cast_fp16")];
+            tensor<int32, [8]> v_padded_17_pad_0 = const()[name = string("v_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_17_mode_0 = const()[name = string("v_padded_17_mode_0"), val = string("constant")];
+            fp16 const_127_to_fp16 = const()[name = string("const_127_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_17_cast_fp16 = pad(constant_val = const_127_to_fp16, mode = v_padded_17_mode_0, pad = v_padded_17_pad_0, x = input_369_cast_fp16)[name = string("v_padded_17_cast_fp16")];
+            tensor<int32, [4]> var_3910_begin_0 = const()[name = string("op_3910_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3910_end_0 = const()[name = string("op_3910_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_3910_end_mask_0 = const()[name = string("op_3910_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3910_cast_fp16 = slice_by_index(begin = var_3910_begin_0, end = var_3910_end_0, end_mask = var_3910_end_mask_0, x = k_padded_17_cast_fp16)[name = string("op_3910_cast_fp16")];
+            tensor<int32, [4]> var_3912_begin_0 = const()[name = string("op_3912_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_3912_end_0 = const()[name = string("op_3912_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_3912_end_mask_0 = const()[name = string("op_3912_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3912_cast_fp16 = slice_by_index(begin = var_3912_begin_0, end = var_3912_end_0, end_mask = var_3912_end_mask_0, x = k_padded_17_cast_fp16)[name = string("op_3912_cast_fp16")];
+            tensor<int32, [4]> var_3914_begin_0 = const()[name = string("op_3914_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_3914_end_0 = const()[name = string("op_3914_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_3914_end_mask_0 = const()[name = string("op_3914_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3914_cast_fp16 = slice_by_index(begin = var_3914_begin_0, end = var_3914_end_0, end_mask = var_3914_end_mask_0, x = k_padded_17_cast_fp16)[name = string("op_3914_cast_fp16")];
+            tensor<int32, [4]> var_3916_begin_0 = const()[name = string("op_3916_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_3916_end_0 = const()[name = string("op_3916_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_3916_end_mask_0 = const()[name = string("op_3916_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3916_cast_fp16 = slice_by_index(begin = var_3916_begin_0, end = var_3916_end_0, end_mask = var_3916_end_mask_0, x = k_padded_17_cast_fp16)[name = string("op_3916_cast_fp16")];
+            tensor<int32, [4]> var_3918_begin_0 = const()[name = string("op_3918_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_3918_end_0 = const()[name = string("op_3918_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_3918_end_mask_0 = const()[name = string("op_3918_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3918_cast_fp16 = slice_by_index(begin = var_3918_begin_0, end = var_3918_end_0, end_mask = var_3918_end_mask_0, x = k_padded_17_cast_fp16)[name = string("op_3918_cast_fp16")];
+            int32 k_blocks_17_axis_0 = const()[name = string("k_blocks_17_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_17_cast_fp16 = stack(axis = k_blocks_17_axis_0, values = (var_3910_cast_fp16, var_3912_cast_fp16, var_3914_cast_fp16, var_3916_cast_fp16, var_3918_cast_fp16))[name = string("k_blocks_17_cast_fp16")];
+            tensor<int32, [4]> var_3922_begin_0 = const()[name = string("op_3922_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3922_end_0 = const()[name = string("op_3922_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_3922_end_mask_0 = const()[name = string("op_3922_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3922_cast_fp16 = slice_by_index(begin = var_3922_begin_0, end = var_3922_end_0, end_mask = var_3922_end_mask_0, x = v_padded_17_cast_fp16)[name = string("op_3922_cast_fp16")];
+            tensor<int32, [4]> var_3924_begin_0 = const()[name = string("op_3924_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_3924_end_0 = const()[name = string("op_3924_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_3924_end_mask_0 = const()[name = string("op_3924_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3924_cast_fp16 = slice_by_index(begin = var_3924_begin_0, end = var_3924_end_0, end_mask = var_3924_end_mask_0, x = v_padded_17_cast_fp16)[name = string("op_3924_cast_fp16")];
+            tensor<int32, [4]> var_3926_begin_0 = const()[name = string("op_3926_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_3926_end_0 = const()[name = string("op_3926_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_3926_end_mask_0 = const()[name = string("op_3926_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3926_cast_fp16 = slice_by_index(begin = var_3926_begin_0, end = var_3926_end_0, end_mask = var_3926_end_mask_0, x = v_padded_17_cast_fp16)[name = string("op_3926_cast_fp16")];
+            tensor<int32, [4]> var_3928_begin_0 = const()[name = string("op_3928_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_3928_end_0 = const()[name = string("op_3928_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_3928_end_mask_0 = const()[name = string("op_3928_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3928_cast_fp16 = slice_by_index(begin = var_3928_begin_0, end = var_3928_end_0, end_mask = var_3928_end_mask_0, x = v_padded_17_cast_fp16)[name = string("op_3928_cast_fp16")];
+            tensor<int32, [4]> var_3930_begin_0 = const()[name = string("op_3930_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_3930_end_0 = const()[name = string("op_3930_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_3930_end_mask_0 = const()[name = string("op_3930_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_3930_cast_fp16 = slice_by_index(begin = var_3930_begin_0, end = var_3930_end_0, end_mask = var_3930_end_mask_0, x = v_padded_17_cast_fp16)[name = string("op_3930_cast_fp16")];
+            int32 v_blocks_17_axis_0 = const()[name = string("v_blocks_17_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_17_cast_fp16 = stack(axis = v_blocks_17_axis_0, values = (var_3922_cast_fp16, var_3924_cast_fp16, var_3926_cast_fp16, var_3928_cast_fp16, var_3930_cast_fp16))[name = string("v_blocks_17_cast_fp16")];
+            tensor<int32, [5]> var_3938 = const()[name = string("op_3938"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_3940 = const()[name = string("op_3940"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_17_transpose_x_0 = const()[name = string("matrix_ac_17_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_17_transpose_y_0 = const()[name = string("matrix_ac_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_17_cast_fp16 = transpose(perm = var_3938, x = q_blocks_17_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_17_cast_fp16 = transpose(perm = var_3940, x = k_blocks_17_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_17_cast_fp16 = matmul(transpose_x = matrix_ac_17_transpose_x_0, transpose_y = matrix_ac_17_transpose_y_0, x = queries_17_cast_fp16, y = keys_t_17_cast_fp16)[name = string("matrix_ac_17_cast_fp16")];
+            tensor<int32, [4]> var_3943 = const()[name = string("op_3943"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_17_cast_fp16 = reshape(shape = var_3943, x = queries_17_cast_fp16)[name = string("q_flat_17_cast_fp16")];
+            bool matrix_bd_81_transpose_x_0 = const()[name = string("matrix_bd_81_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_81_transpose_y_0 = const()[name = string("matrix_bd_81_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_17_to_fp16 = const()[name = string("rel_k_t_17_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103361472)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_81_cast_fp16 = matmul(transpose_x = matrix_bd_81_transpose_x_0, transpose_y = matrix_bd_81_transpose_y_0, x = q_flat_17_cast_fp16, y = rel_k_t_17_to_fp16)[name = string("matrix_bd_81_cast_fp16")];
+            tensor<int32, [5]> var_3948 = const()[name = string("op_3948"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_371_cast_fp16 = reshape(shape = var_3948, x = matrix_bd_81_cast_fp16)[name = string("input_371_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_83_pad_0 = const()[name = string("matrix_bd_83_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103388160)))];
+            string matrix_bd_83_mode_0 = const()[name = string("matrix_bd_83_mode_0"), val = string("constant")];
+            fp16 const_129_to_fp16 = const()[name = string("const_129_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_83_cast_fp16 = pad(constant_val = const_129_to_fp16, mode = matrix_bd_83_mode_0, pad = matrix_bd_83_pad_0, x = input_371_cast_fp16)[name = string("matrix_bd_83_cast_fp16")];
+            tensor<int32, [4]> var_3952 = const()[name = string("op_3952"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_85_cast_fp16 = reshape(shape = var_3952, x = matrix_bd_83_cast_fp16)[name = string("matrix_bd_85_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_87_begin_0 = const()[name = string("matrix_bd_87_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_87_end_0 = const()[name = string("matrix_bd_87_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_87_end_mask_0 = const()[name = string("matrix_bd_87_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_87_cast_fp16 = slice_by_index(begin = matrix_bd_87_begin_0, end = matrix_bd_87_end_0, end_mask = matrix_bd_87_end_mask_0, x = matrix_bd_85_cast_fp16)[name = string("matrix_bd_87_cast_fp16")];
+            tensor<int32, [5]> var_3958 = const()[name = string("op_3958"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_89_cast_fp16 = reshape(shape = var_3958, x = matrix_bd_87_cast_fp16)[name = string("matrix_bd_89_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_49_cast_fp16 = add(x = matrix_ac_17_cast_fp16, y = matrix_bd_89_cast_fp16)[name = string("attn_49_cast_fp16")];
+            fp16 _inversed_3961_y_0_to_fp16 = const()[name = string("_inversed_3961_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_3961_cast_fp16 = mul(x = attn_49_cast_fp16, y = _inversed_3961_y_0_to_fp16)[name = string("_inversed_3961_cast_fp16")];
+            string _inversed_3961_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_3961_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_3961_cast_fp16_to_fp32 = cast(dtype = _inversed_3961_cast_fp16_to_fp32_dtype_0, x = _inversed_3961_cast_fp16)[name = string("cast_165")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_3962 = tanh(x = _inversed_3961_cast_fp16_to_fp32)[name = string("op_3962")];
+            string var_3962_to_fp16_dtype_0 = const()[name = string("op_3962_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_8_softcap_to_fp16 = const()[name = string("self_attns_8_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_3962_to_fp16 = cast(dtype = var_3962_to_fp16_dtype_0, x = var_3962)[name = string("cast_164")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_51_cast_fp16 = mul(x = var_3962_to_fp16, y = self_attns_8_softcap_to_fp16)[name = string("attn_51_cast_fp16")];
+            string attn_51_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_51_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_51_cast_fp16_to_fp32 = cast(dtype = attn_51_cast_fp16_to_fp32_dtype_0, x = attn_51_cast_fp16)[name = string("cast_163")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_373 = select(a = var_3825, b = attn_51_cast_fp16_to_fp32, cond = var_460)[name = string("input_373")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_3966 = softmax(axis = var_3824, x = input_373)[name = string("op_3966")];
+            tensor<int32, [5]> var_3968 = const()[name = string("op_3968"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_49_transpose_x_0 = const()[name = string("out_49_transpose_x_0"), val = bool(false)];
+            bool out_49_transpose_y_0 = const()[name = string("out_49_transpose_y_0"), val = bool(false)];
+            string var_3966_to_fp16_dtype_0 = const()[name = string("op_3966_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_17_cast_fp16 = transpose(perm = var_3968, x = v_blocks_17_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_3966_to_fp16 = cast(dtype = var_3966_to_fp16_dtype_0, x = var_3966)[name = string("cast_162")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_49_cast_fp16 = matmul(transpose_x = out_49_transpose_x_0, transpose_y = out_49_transpose_y_0, x = var_3966_to_fp16, y = values_t_17_cast_fp16)[name = string("out_49_cast_fp16")];
+            tensor<int32, [5]> var_3971 = const()[name = string("op_3971"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_3973 = const()[name = string("op_3973"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_3972_cast_fp16 = transpose(perm = var_3971, x = out_49_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 60, 1024]> out_51_cast_fp16 = reshape(shape = var_3973, x = var_3972_cast_fp16)[name = string("out_51_cast_fp16")];
+            tensor<int32, [3]> var_3976_begin_0 = const()[name = string("op_3976_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_3976_end_0 = const()[name = string("op_3976_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_3976_end_mask_0 = const()[name = string("op_3976_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_3976_cast_fp16 = slice_by_index(begin = var_3976_begin_0, end = var_3976_end_0, end_mask = var_3976_end_mask_0, x = out_51_cast_fp16)[name = string("op_3976_cast_fp16")];
+            fp16 self_attns_8_post_input_min_to_fp16 = const()[name = string("self_attns_8_post_input_min_to_fp16"), val = fp16(-0x1.aap+3)];
+            fp16 self_attns_8_post_input_max_to_fp16 = const()[name = string("self_attns_8_post_input_max_to_fp16"), val = fp16(0x1.a6p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_237_cast_fp16 = clip(alpha = self_attns_8_post_input_min_to_fp16, beta = self_attns_8_post_input_max_to_fp16, x = var_3976_cast_fp16)[name = string("clip_237_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_8_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103388288))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103912640))))[name = string("self_attns_8_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_95_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_8_post_linear_weight_to_fp16_palettized, x = clip_237_cast_fp16)[name = string("linear_95_cast_fp16")];
+            fp16 self_attns_8_post_output_min_to_fp16 = const()[name = string("self_attns_8_post_output_min_to_fp16"), val = fp16(-0x1.36p+5)];
+            fp16 self_attns_8_post_output_max_to_fp16 = const()[name = string("self_attns_8_post_output_max_to_fp16"), val = fp16(0x1.34p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_238_cast_fp16 = clip(alpha = self_attns_8_post_output_min_to_fp16, beta = self_attns_8_post_output_max_to_fp16, x = linear_95_cast_fp16)[name = string("clip_238_cast_fp16")];
+            fp16 var_3988_to_fp16 = const()[name = string("op_3988_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_3989_to_fp16 = const()[name = string("op_3989_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_239_cast_fp16 = clip(alpha = var_3988_to_fp16, beta = var_3989_to_fp16, x = clip_238_cast_fp16)[name = string("clip_239_cast_fp16")];
+            string clip_239_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_239_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_3991 = const()[name = string("op_3991"), val = fp32(-0x1p-1)];
+            fp32 var_3995_promoted = const()[name = string("op_3995_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_239_cast_fp16_to_fp32 = cast(dtype = clip_239_cast_fp16_to_fp32_dtype_0, x = clip_239_cast_fp16)[name = string("cast_161")];
+            tensor<fp32, [1, 50, 1024]> var_4001 = pow(x = clip_239_cast_fp16_to_fp32, y = var_3995_promoted)[name = string("op_4001")];
+            tensor<int32, [1]> var_4003_axes_0 = const()[name = string("op_4003_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4003_keep_dims_0 = const()[name = string("op_4003_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4003 = reduce_mean(axes = var_4003_axes_0, keep_dims = var_4003_keep_dims_0, x = var_4001)[name = string("op_4003")];
+            string var_4003_to_fp16_dtype_0 = const()[name = string("op_4003_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4004_to_fp16 = const()[name = string("op_4004_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4003_to_fp16 = cast(dtype = var_4003_to_fp16_dtype_0, x = var_4003)[name = string("cast_160")];
+            tensor<fp16, [1, 50, 1]> mean_squared_151_cast_fp16 = add(x = var_4003_to_fp16, y = var_4004_to_fp16)[name = string("mean_squared_151_cast_fp16")];
+            string mean_squared_151_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_151_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_151_cast_fp16_to_fp32 = cast(dtype = mean_squared_151_cast_fp16_to_fp32_dtype_0, x = mean_squared_151_cast_fp16)[name = string("cast_159")];
+            tensor<fp32, [1, 50, 1]> var_4006 = pow(x = mean_squared_151_cast_fp16_to_fp32, y = var_3991)[name = string("op_4006")];
+            string var_4006_to_fp16_dtype_0 = const()[name = string("op_4006_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4006_to_fp16 = cast(dtype = var_4006_to_fp16_dtype_0, x = var_4006)[name = string("cast_158")];
+            tensor<fp16, [1, 50, 1024]> normed_output_301_cast_fp16 = mul(x = clip_239_cast_fp16, y = var_4006_to_fp16)[name = string("normed_output_301_cast_fp16")];
+            tensor<fp16, [1024]> const_130_to_fp16 = const()[name = string("const_130_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103913728)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_303_cast_fp16 = mul(x = normed_output_301_cast_fp16, y = const_130_to_fp16)[name = string("normed_output_303_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_877_cast_fp16 = add(x = normed_output_303_cast_fp16, y = hidden_states_851_cast_fp16)[name = string("hidden_states_877_cast_fp16")];
+            string hidden_states_877_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_877_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4013 = const()[name = string("op_4013"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4014 = const()[name = string("op_4014"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_4026 = const()[name = string("op_4026"), val = fp32(-0x1p-1)];
+            fp32 var_4022_promoted = const()[name = string("op_4022_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_877_cast_fp16_to_fp32 = cast(dtype = hidden_states_877_cast_fp16_to_fp32_dtype_0, x = hidden_states_877_cast_fp16)[name = string("cast_157")];
+            tensor<fp32, [1, 50, 1024]> var_4034 = pow(x = hidden_states_877_cast_fp16_to_fp32, y = var_4022_promoted)[name = string("op_4034")];
+            tensor<int32, [1]> var_4036_axes_0 = const()[name = string("op_4036_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4036_keep_dims_0 = const()[name = string("op_4036_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4036 = reduce_mean(axes = var_4036_axes_0, keep_dims = var_4036_keep_dims_0, x = var_4034)[name = string("op_4036")];
+            string var_4036_to_fp16_dtype_0 = const()[name = string("op_4036_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4037_to_fp16 = const()[name = string("op_4037_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4036_to_fp16 = cast(dtype = var_4036_to_fp16_dtype_0, x = var_4036)[name = string("cast_156")];
+            tensor<fp16, [1, 50, 1]> mean_squared_153_cast_fp16 = add(x = var_4036_to_fp16, y = var_4037_to_fp16)[name = string("mean_squared_153_cast_fp16")];
+            string mean_squared_153_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_153_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_153_cast_fp16_to_fp32 = cast(dtype = mean_squared_153_cast_fp16_to_fp32_dtype_0, x = mean_squared_153_cast_fp16)[name = string("cast_155")];
+            tensor<fp32, [1, 50, 1]> var_4039 = pow(x = mean_squared_153_cast_fp16_to_fp32, y = var_4026)[name = string("op_4039")];
+            string var_4039_to_fp16_dtype_0 = const()[name = string("op_4039_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4039_to_fp16 = cast(dtype = var_4039_to_fp16_dtype_0, x = var_4039)[name = string("cast_154")];
+            tensor<fp16, [1, 50, 1024]> normed_output_305_cast_fp16 = mul(x = hidden_states_877_cast_fp16, y = var_4039_to_fp16)[name = string("normed_output_305_cast_fp16")];
+            tensor<fp16, [1024]> const_131_to_fp16 = const()[name = string("const_131_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103915840)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_307_cast_fp16 = mul(x = normed_output_305_cast_fp16, y = const_131_to_fp16)[name = string("normed_output_307_cast_fp16")];
+            fp16 lconv1ds_8_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_8_linear_start_input_min_to_fp16"), val = fp16(-0x1.5ep+3)];
+            fp16 lconv1ds_8_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_8_linear_start_input_max_to_fp16"), val = fp16(0x1.5cp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_240_cast_fp16 = clip(alpha = lconv1ds_8_linear_start_input_min_to_fp16, beta = lconv1ds_8_linear_start_input_max_to_fp16, x = normed_output_307_cast_fp16)[name = string("clip_240_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_8_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103917952))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104966592))))[name = string("lconv1ds_8_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_96_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_8_linear_start_linear_weight_to_fp16_palettized, x = clip_240_cast_fp16)[name = string("linear_96_cast_fp16")];
+            fp16 lconv1ds_8_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_8_linear_start_output_min_to_fp16"), val = fp16(-0x1.7ap+4)];
+            fp16 lconv1ds_8_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_8_linear_start_output_max_to_fp16"), val = fp16(0x1.76p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_241_cast_fp16 = clip(alpha = lconv1ds_8_linear_start_output_min_to_fp16, beta = lconv1ds_8_linear_start_output_max_to_fp16, x = linear_96_cast_fp16)[name = string("clip_241_cast_fp16")];
+            int32 hidden_states_885_split_num_splits_0 = const()[name = string("hidden_states_885_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_885_split_axis_0 = const()[name = string("hidden_states_885_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_885_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_885_split_cast_fp16_1 = split(axis = hidden_states_885_split_axis_0, num_splits = hidden_states_885_split_num_splits_0, x = clip_241_cast_fp16)[name = string("hidden_states_885_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_885_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_885_split_cast_fp16_1)[name = string("hidden_states_885_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_885_cast_fp16 = mul(x = hidden_states_885_split_cast_fp16_0, y = hidden_states_885_split_1_sigmoid_cast_fp16)[name = string("hidden_states_885_cast_fp16")];
+            tensor<int32, [3]> input_381_perm_0 = const()[name = string("input_381_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_383_pad_0 = const()[name = string("input_383_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_383_mode_0 = const()[name = string("input_383_mode_0"), val = string("constant")];
+            fp16 const_132_to_fp16 = const()[name = string("const_132_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_381_cast_fp16 = transpose(perm = input_381_perm_0, x = hidden_states_885_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 1024, 54]> input_383_cast_fp16 = pad(constant_val = const_132_to_fp16, mode = input_383_mode_0, pad = input_383_pad_0, x = input_381_cast_fp16)[name = string("input_383_cast_fp16")];
+            string var_4065_pad_type_0 = const()[name = string("op_4065_pad_type_0"), val = string("valid")];
+            int32 var_4065_groups_0 = const()[name = string("op_4065_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_4065_strides_0 = const()[name = string("op_4065_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4065_pad_0 = const()[name = string("op_4065_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4065_dilations_0 = const()[name = string("op_4065_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_8_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104968704))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104971328))))[name = string("lconv1ds_8_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_4065_cast_fp16 = conv(dilations = var_4065_dilations_0, groups = var_4065_groups_0, pad = var_4065_pad_0, pad_type = var_4065_pad_type_0, strides = var_4065_strides_0, weight = lconv1ds_8_depthwise_conv1d_weight_to_fp16_palettized, x = input_383_cast_fp16)[name = string("op_4065_cast_fp16")];
+            tensor<int32, [3]> hidden_states_887_perm_0 = const()[name = string("hidden_states_887_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_887_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_887_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_887_cast_fp16 = transpose(perm = hidden_states_887_perm_0, x = var_4065_cast_fp16)[name = string("transpose_18")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_887_cast_fp16_to_fp32 = cast(dtype = hidden_states_887_cast_fp16_to_fp32_dtype_0, x = hidden_states_887_cast_fp16)[name = string("cast_153")];
+            tensor<fp32, [1, 50, 1024]> clip_242 = clip(alpha = var_4014, beta = var_4013, x = hidden_states_887_cast_fp16_to_fp32)[name = string("clip_242")];
+            fp32 var_4022_promoted_1 = const()[name = string("op_4022_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4070 = pow(x = clip_242, y = var_4022_promoted_1)[name = string("op_4070")];
+            tensor<int32, [1]> var_4072_axes_0 = const()[name = string("op_4072_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4072_keep_dims_0 = const()[name = string("op_4072_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4072 = reduce_mean(axes = var_4072_axes_0, keep_dims = var_4072_keep_dims_0, x = var_4070)[name = string("op_4072")];
+            string var_4072_to_fp16_dtype_0 = const()[name = string("op_4072_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4073_to_fp16 = const()[name = string("op_4073_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4072_to_fp16 = cast(dtype = var_4072_to_fp16_dtype_0, x = var_4072)[name = string("cast_152")];
+            tensor<fp16, [1, 50, 1]> mean_squared_155_cast_fp16 = add(x = var_4072_to_fp16, y = var_4073_to_fp16)[name = string("mean_squared_155_cast_fp16")];
+            string mean_squared_155_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_155_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_155_cast_fp16_to_fp32 = cast(dtype = mean_squared_155_cast_fp16_to_fp32_dtype_0, x = mean_squared_155_cast_fp16)[name = string("cast_151")];
+            tensor<fp32, [1, 50, 1]> var_4075 = pow(x = mean_squared_155_cast_fp16_to_fp32, y = var_4026)[name = string("op_4075")];
+            string clip_242_to_fp16_dtype_0 = const()[name = string("clip_242_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4075_to_fp16_dtype_0 = const()[name = string("op_4075_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_242_to_fp16 = cast(dtype = clip_242_to_fp16_dtype_0, x = clip_242)[name = string("cast_149")];
+            tensor<fp16, [1, 50, 1]> var_4075_to_fp16 = cast(dtype = var_4075_to_fp16_dtype_0, x = var_4075)[name = string("cast_150")];
+            tensor<fp16, [1, 50, 1024]> normed_output_309_cast_fp16 = mul(x = clip_242_to_fp16, y = var_4075_to_fp16)[name = string("normed_output_309_cast_fp16")];
+            tensor<fp16, [1024]> const_133_to_fp16 = const()[name = string("const_133_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104972416)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_311_cast_fp16 = mul(x = normed_output_309_cast_fp16, y = const_133_to_fp16)[name = string("normed_output_311_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_893_cast_fp16 = silu(x = normed_output_311_cast_fp16)[name = string("hidden_states_893_cast_fp16")];
+            fp16 lconv1ds_8_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_8_linear_end_input_min_to_fp16"), val = fp16(-0x1.fp+2)];
+            fp16 lconv1ds_8_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_8_linear_end_input_max_to_fp16"), val = fp16(0x1.ecp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_243_cast_fp16 = clip(alpha = lconv1ds_8_linear_end_input_min_to_fp16, beta = lconv1ds_8_linear_end_input_max_to_fp16, x = hidden_states_893_cast_fp16)[name = string("clip_243_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_8_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104974528))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105498880))))[name = string("lconv1ds_8_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_97_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_8_linear_end_linear_weight_to_fp16_palettized, x = clip_243_cast_fp16)[name = string("linear_97_cast_fp16")];
+            fp16 lconv1ds_8_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_8_linear_end_output_min_to_fp16"), val = fp16(-0x1.c4p+2)];
+            fp16 lconv1ds_8_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_8_linear_end_output_max_to_fp16"), val = fp16(0x1.cp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_244_cast_fp16 = clip(alpha = lconv1ds_8_linear_end_output_min_to_fp16, beta = lconv1ds_8_linear_end_output_max_to_fp16, x = linear_97_cast_fp16)[name = string("clip_244_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_899_cast_fp16 = add(x = clip_244_cast_fp16, y = hidden_states_877_cast_fp16)[name = string("hidden_states_899_cast_fp16")];
+            string hidden_states_899_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_899_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4099 = const()[name = string("op_4099"), val = fp32(-0x1p-1)];
+            fp32 var_4100 = const()[name = string("op_4100"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4101 = const()[name = string("op_4101"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_899_cast_fp16_to_fp32 = cast(dtype = hidden_states_899_cast_fp16_to_fp32_dtype_0, x = hidden_states_899_cast_fp16)[name = string("cast_148")];
+            tensor<fp32, [1, 50, 1024]> clip_245 = clip(alpha = var_4101, beta = var_4100, x = hidden_states_899_cast_fp16_to_fp32)[name = string("clip_245")];
+            fp32 var_4095_promoted = const()[name = string("op_4095_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4109 = pow(x = clip_245, y = var_4095_promoted)[name = string("op_4109")];
+            tensor<int32, [1]> var_4111_axes_0 = const()[name = string("op_4111_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4111_keep_dims_0 = const()[name = string("op_4111_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4111 = reduce_mean(axes = var_4111_axes_0, keep_dims = var_4111_keep_dims_0, x = var_4109)[name = string("op_4111")];
+            string var_4111_to_fp16_dtype_0 = const()[name = string("op_4111_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4112_to_fp16 = const()[name = string("op_4112_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4111_to_fp16 = cast(dtype = var_4111_to_fp16_dtype_0, x = var_4111)[name = string("cast_147")];
+            tensor<fp16, [1, 50, 1]> mean_squared_157_cast_fp16 = add(x = var_4111_to_fp16, y = var_4112_to_fp16)[name = string("mean_squared_157_cast_fp16")];
+            string mean_squared_157_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_157_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_157_cast_fp16_to_fp32 = cast(dtype = mean_squared_157_cast_fp16_to_fp32_dtype_0, x = mean_squared_157_cast_fp16)[name = string("cast_146")];
+            tensor<fp32, [1, 50, 1]> var_4114 = pow(x = mean_squared_157_cast_fp16_to_fp32, y = var_4099)[name = string("op_4114")];
+            string clip_245_to_fp16_dtype_0 = const()[name = string("clip_245_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4114_to_fp16_dtype_0 = const()[name = string("op_4114_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_245_to_fp16 = cast(dtype = clip_245_to_fp16_dtype_0, x = clip_245)[name = string("cast_144")];
+            tensor<fp16, [1, 50, 1]> var_4114_to_fp16 = cast(dtype = var_4114_to_fp16_dtype_0, x = var_4114)[name = string("cast_145")];
+            tensor<fp16, [1, 50, 1024]> normed_output_313_cast_fp16 = mul(x = clip_245_to_fp16, y = var_4114_to_fp16)[name = string("normed_output_313_cast_fp16")];
+            tensor<fp16, [1024]> const_134_to_fp16 = const()[name = string("const_134_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105499968)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_315_cast_fp16 = mul(x = normed_output_313_cast_fp16, y = const_134_to_fp16)[name = string("normed_output_315_cast_fp16")];
+            fp16 feed_forward2s_8_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.7cp+3)];
+            fp16 feed_forward2s_8_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.78p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_246_cast_fp16 = clip(alpha = feed_forward2s_8_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_8_ffw_layer_1_input_max_to_fp16, x = normed_output_315_cast_fp16)[name = string("clip_246_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_8_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(105502080))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(107599296))))[name = string("feed_forward2s_8_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_98_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_8_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_246_cast_fp16)[name = string("linear_98_cast_fp16")];
+            fp16 feed_forward2s_8_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.cp+4)];
+            fp16 feed_forward2s_8_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.bcp+4)];
+            tensor<fp16, [1, 50, 4096]> clip_247_cast_fp16 = clip(alpha = feed_forward2s_8_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_8_ffw_layer_1_output_max_to_fp16, x = linear_98_cast_fp16)[name = string("clip_247_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_909_cast_fp16 = silu(x = clip_247_cast_fp16)[name = string("hidden_states_909_cast_fp16")];
+            fp16 feed_forward2s_8_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.3ap+3)];
+            fp16 feed_forward2s_8_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.38p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_248_cast_fp16 = clip(alpha = feed_forward2s_8_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_8_ffw_layer_2_input_max_to_fp16, x = hidden_states_909_cast_fp16)[name = string("clip_248_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_8_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(107603456))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109700672))))[name = string("feed_forward2s_8_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_99_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_8_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_248_cast_fp16)[name = string("linear_99_cast_fp16")];
+            fp16 feed_forward2s_8_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.86p+5)];
+            fp16 feed_forward2s_8_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_8_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.82p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_249_cast_fp16 = clip(alpha = feed_forward2s_8_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_8_ffw_layer_2_output_max_to_fp16, x = linear_99_cast_fp16)[name = string("clip_249_cast_fp16")];
+            string clip_249_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_249_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_249_cast_fp16_to_fp32 = cast(dtype = clip_249_cast_fp16_to_fp32_dtype_0, x = clip_249_cast_fp16)[name = string("cast_143")];
+            tensor<fp32, [1, 50, 1024]> clip_250 = clip(alpha = var_4101, beta = var_4100, x = clip_249_cast_fp16_to_fp32)[name = string("clip_250")];
+            fp32 var_4095_promoted_1 = const()[name = string("op_4095_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4141 = pow(x = clip_250, y = var_4095_promoted_1)[name = string("op_4141")];
+            tensor<int32, [1]> var_4143_axes_0 = const()[name = string("op_4143_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4143_keep_dims_0 = const()[name = string("op_4143_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4143 = reduce_mean(axes = var_4143_axes_0, keep_dims = var_4143_keep_dims_0, x = var_4141)[name = string("op_4143")];
+            string var_4143_to_fp16_dtype_0 = const()[name = string("op_4143_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4144_to_fp16 = const()[name = string("op_4144_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4143_to_fp16 = cast(dtype = var_4143_to_fp16_dtype_0, x = var_4143)[name = string("cast_142")];
+            tensor<fp16, [1, 50, 1]> mean_squared_159_cast_fp16 = add(x = var_4143_to_fp16, y = var_4144_to_fp16)[name = string("mean_squared_159_cast_fp16")];
+            string mean_squared_159_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_159_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_159_cast_fp16_to_fp32 = cast(dtype = mean_squared_159_cast_fp16_to_fp32_dtype_0, x = mean_squared_159_cast_fp16)[name = string("cast_141")];
+            tensor<fp32, [1, 50, 1]> var_4146 = pow(x = mean_squared_159_cast_fp16_to_fp32, y = var_4099)[name = string("op_4146")];
+            string clip_250_to_fp16_dtype_0 = const()[name = string("clip_250_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4146_to_fp16_dtype_0 = const()[name = string("op_4146_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_250_to_fp16 = cast(dtype = clip_250_to_fp16_dtype_0, x = clip_250)[name = string("cast_139")];
+            tensor<fp16, [1, 50, 1]> var_4146_to_fp16 = cast(dtype = var_4146_to_fp16_dtype_0, x = var_4146)[name = string("cast_140")];
+            tensor<fp16, [1, 50, 1024]> normed_output_317_cast_fp16 = mul(x = clip_250_to_fp16, y = var_4146_to_fp16)[name = string("normed_output_317_cast_fp16")];
+            tensor<fp16, [1024]> const_135_to_fp16 = const()[name = string("const_135_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109701760)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_319_cast_fp16 = mul(x = normed_output_317_cast_fp16, y = const_135_to_fp16)[name = string("normed_output_319_cast_fp16")];
+            fp16 var_4091_to_fp16 = const()[name = string("op_4091_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_921_cast_fp16 = mul(x = normed_output_319_cast_fp16, y = var_4091_to_fp16)[name = string("hidden_states_921_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_923_cast_fp16 = add(x = hidden_states_921_cast_fp16, y = hidden_states_899_cast_fp16)[name = string("hidden_states_923_cast_fp16")];
+            fp16 var_4153_to_fp16 = const()[name = string("op_4153_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_4154_to_fp16 = const()[name = string("op_4154_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_251_cast_fp16 = clip(alpha = var_4153_to_fp16, beta = var_4154_to_fp16, x = hidden_states_923_cast_fp16)[name = string("clip_251_cast_fp16")];
+            string clip_251_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_251_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4156 = const()[name = string("op_4156"), val = fp32(-0x1p-1)];
+            fp32 var_4160_promoted = const()[name = string("op_4160_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_251_cast_fp16_to_fp32 = cast(dtype = clip_251_cast_fp16_to_fp32_dtype_0, x = clip_251_cast_fp16)[name = string("cast_138")];
+            tensor<fp32, [1, 50, 1024]> var_4166 = pow(x = clip_251_cast_fp16_to_fp32, y = var_4160_promoted)[name = string("op_4166")];
+            tensor<int32, [1]> var_4168_axes_0 = const()[name = string("op_4168_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4168_keep_dims_0 = const()[name = string("op_4168_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4168 = reduce_mean(axes = var_4168_axes_0, keep_dims = var_4168_keep_dims_0, x = var_4166)[name = string("op_4168")];
+            string var_4168_to_fp16_dtype_0 = const()[name = string("op_4168_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4169_to_fp16 = const()[name = string("op_4169_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4168_to_fp16 = cast(dtype = var_4168_to_fp16_dtype_0, x = var_4168)[name = string("cast_137")];
+            tensor<fp16, [1, 50, 1]> mean_squared_161_cast_fp16 = add(x = var_4168_to_fp16, y = var_4169_to_fp16)[name = string("mean_squared_161_cast_fp16")];
+            string mean_squared_161_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_161_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_161_cast_fp16_to_fp32 = cast(dtype = mean_squared_161_cast_fp16_to_fp32_dtype_0, x = mean_squared_161_cast_fp16)[name = string("cast_136")];
+            tensor<fp32, [1, 50, 1]> var_4171 = pow(x = mean_squared_161_cast_fp16_to_fp32, y = var_4156)[name = string("op_4171")];
+            string var_4171_to_fp16_dtype_0 = const()[name = string("op_4171_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4171_to_fp16 = cast(dtype = var_4171_to_fp16_dtype_0, x = var_4171)[name = string("cast_135")];
+            tensor<fp16, [1, 50, 1024]> normed_output_321_cast_fp16 = mul(x = clip_251_cast_fp16, y = var_4171_to_fp16)[name = string("normed_output_321_cast_fp16")];
+            tensor<fp16, [1024]> const_136_to_fp16 = const()[name = string("const_136_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109703872)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_323_cast_fp16 = mul(x = normed_output_321_cast_fp16, y = const_136_to_fp16)[name = string("normed_output_323_cast_fp16")];
+            string normed_output_323_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_323_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4184 = const()[name = string("op_4184"), val = fp32(-0x1p-1)];
+            fp32 var_4185 = const()[name = string("op_4185"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4186 = const()[name = string("op_4186"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_323_cast_fp16_to_fp32 = cast(dtype = normed_output_323_cast_fp16_to_fp32_dtype_0, x = normed_output_323_cast_fp16)[name = string("cast_134")];
+            tensor<fp32, [1, 50, 1024]> clip_252 = clip(alpha = var_4186, beta = var_4185, x = normed_output_323_cast_fp16_to_fp32)[name = string("clip_252")];
+            fp32 var_4180_promoted = const()[name = string("op_4180_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4194 = pow(x = clip_252, y = var_4180_promoted)[name = string("op_4194")];
+            tensor<int32, [1]> var_4196_axes_0 = const()[name = string("op_4196_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4196_keep_dims_0 = const()[name = string("op_4196_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4196 = reduce_mean(axes = var_4196_axes_0, keep_dims = var_4196_keep_dims_0, x = var_4194)[name = string("op_4196")];
+            string var_4196_to_fp16_dtype_0 = const()[name = string("op_4196_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4197_to_fp16 = const()[name = string("op_4197_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4196_to_fp16 = cast(dtype = var_4196_to_fp16_dtype_0, x = var_4196)[name = string("cast_133")];
+            tensor<fp16, [1, 50, 1]> mean_squared_163_cast_fp16 = add(x = var_4196_to_fp16, y = var_4197_to_fp16)[name = string("mean_squared_163_cast_fp16")];
+            string mean_squared_163_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_163_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_163_cast_fp16_to_fp32 = cast(dtype = mean_squared_163_cast_fp16_to_fp32_dtype_0, x = mean_squared_163_cast_fp16)[name = string("cast_132")];
+            tensor<fp32, [1, 50, 1]> var_4199 = pow(x = mean_squared_163_cast_fp16_to_fp32, y = var_4184)[name = string("op_4199")];
+            string clip_252_to_fp16_dtype_0 = const()[name = string("clip_252_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4199_to_fp16_dtype_0 = const()[name = string("op_4199_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_252_to_fp16 = cast(dtype = clip_252_to_fp16_dtype_0, x = clip_252)[name = string("cast_130")];
+            tensor<fp16, [1, 50, 1]> var_4199_to_fp16 = cast(dtype = var_4199_to_fp16_dtype_0, x = var_4199)[name = string("cast_131")];
+            tensor<fp16, [1, 50, 1024]> normed_output_325_cast_fp16 = mul(x = clip_252_to_fp16, y = var_4199_to_fp16)[name = string("normed_output_325_cast_fp16")];
+            tensor<fp16, [1024]> const_137_to_fp16 = const()[name = string("const_137_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109705984)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_327_cast_fp16 = mul(x = normed_output_325_cast_fp16, y = const_137_to_fp16)[name = string("normed_output_327_cast_fp16")];
+            fp16 feed_forward1s_9_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.f4p+2)];
+            fp16 feed_forward1s_9_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.fp+2)];
+            tensor<fp16, [1, 50, 1024]> clip_253_cast_fp16 = clip(alpha = feed_forward1s_9_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_9_ffw_layer_1_input_max_to_fp16, x = normed_output_327_cast_fp16)[name = string("clip_253_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_9_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(109708096))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111805312))))[name = string("feed_forward1s_9_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_100_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_9_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_253_cast_fp16)[name = string("linear_100_cast_fp16")];
+            fp16 feed_forward1s_9_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.c8p+3)];
+            fp16 feed_forward1s_9_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.c4p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_254_cast_fp16 = clip(alpha = feed_forward1s_9_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_9_ffw_layer_1_output_max_to_fp16, x = linear_100_cast_fp16)[name = string("clip_254_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_939_cast_fp16 = silu(x = clip_254_cast_fp16)[name = string("hidden_states_939_cast_fp16")];
+            fp16 feed_forward1s_9_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.08p+3)];
+            fp16 feed_forward1s_9_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.06p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_255_cast_fp16 = clip(alpha = feed_forward1s_9_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_9_ffw_layer_2_input_max_to_fp16, x = hidden_states_939_cast_fp16)[name = string("clip_255_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_9_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111809472))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113906688))))[name = string("feed_forward1s_9_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_101_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_9_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_255_cast_fp16)[name = string("linear_101_cast_fp16")];
+            fp16 feed_forward1s_9_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.4p+5)];
+            fp16 feed_forward1s_9_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_9_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.3ep+5)];
+            tensor<fp16, [1, 50, 1024]> clip_256_cast_fp16 = clip(alpha = feed_forward1s_9_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_9_ffw_layer_2_output_max_to_fp16, x = linear_101_cast_fp16)[name = string("clip_256_cast_fp16")];
+            string clip_256_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_256_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_256_cast_fp16_to_fp32 = cast(dtype = clip_256_cast_fp16_to_fp32_dtype_0, x = clip_256_cast_fp16)[name = string("cast_129")];
+            tensor<fp32, [1, 50, 1024]> clip_257 = clip(alpha = var_4186, beta = var_4185, x = clip_256_cast_fp16_to_fp32)[name = string("clip_257")];
+            fp32 var_4180_promoted_1 = const()[name = string("op_4180_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4226 = pow(x = clip_257, y = var_4180_promoted_1)[name = string("op_4226")];
+            tensor<int32, [1]> var_4228_axes_0 = const()[name = string("op_4228_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4228_keep_dims_0 = const()[name = string("op_4228_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4228 = reduce_mean(axes = var_4228_axes_0, keep_dims = var_4228_keep_dims_0, x = var_4226)[name = string("op_4228")];
+            string var_4228_to_fp16_dtype_0 = const()[name = string("op_4228_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4229_to_fp16 = const()[name = string("op_4229_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4228_to_fp16 = cast(dtype = var_4228_to_fp16_dtype_0, x = var_4228)[name = string("cast_128")];
+            tensor<fp16, [1, 50, 1]> mean_squared_165_cast_fp16 = add(x = var_4228_to_fp16, y = var_4229_to_fp16)[name = string("mean_squared_165_cast_fp16")];
+            string mean_squared_165_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_165_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_165_cast_fp16_to_fp32 = cast(dtype = mean_squared_165_cast_fp16_to_fp32_dtype_0, x = mean_squared_165_cast_fp16)[name = string("cast_127")];
+            tensor<fp32, [1, 50, 1]> var_4231 = pow(x = mean_squared_165_cast_fp16_to_fp32, y = var_4184)[name = string("op_4231")];
+            string clip_257_to_fp16_dtype_0 = const()[name = string("clip_257_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4231_to_fp16_dtype_0 = const()[name = string("op_4231_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_257_to_fp16 = cast(dtype = clip_257_to_fp16_dtype_0, x = clip_257)[name = string("cast_125")];
+            tensor<fp16, [1, 50, 1]> var_4231_to_fp16 = cast(dtype = var_4231_to_fp16_dtype_0, x = var_4231)[name = string("cast_126")];
+            tensor<fp16, [1, 50, 1024]> normed_output_329_cast_fp16 = mul(x = clip_257_to_fp16, y = var_4231_to_fp16)[name = string("normed_output_329_cast_fp16")];
+            tensor<fp16, [1024]> const_138_to_fp16 = const()[name = string("const_138_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113907776)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_331_cast_fp16 = mul(x = normed_output_329_cast_fp16, y = const_138_to_fp16)[name = string("normed_output_331_cast_fp16")];
+            fp16 var_4176_to_fp16 = const()[name = string("op_4176_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_951_cast_fp16 = mul(x = normed_output_331_cast_fp16, y = var_4176_to_fp16)[name = string("hidden_states_951_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_953_cast_fp16 = add(x = hidden_states_951_cast_fp16, y = normed_output_323_cast_fp16)[name = string("hidden_states_953_cast_fp16")];
+            fp16 var_4238_to_fp16 = const()[name = string("op_4238_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_4239_to_fp16 = const()[name = string("op_4239_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_258_cast_fp16 = clip(alpha = var_4238_to_fp16, beta = var_4239_to_fp16, x = hidden_states_953_cast_fp16)[name = string("clip_258_cast_fp16")];
+            string clip_258_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_258_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4241 = const()[name = string("op_4241"), val = fp32(-0x1p-1)];
+            fp32 var_4245_promoted = const()[name = string("op_4245_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_258_cast_fp16_to_fp32 = cast(dtype = clip_258_cast_fp16_to_fp32_dtype_0, x = clip_258_cast_fp16)[name = string("cast_124")];
+            tensor<fp32, [1, 50, 1024]> var_4251 = pow(x = clip_258_cast_fp16_to_fp32, y = var_4245_promoted)[name = string("op_4251")];
+            tensor<int32, [1]> var_4253_axes_0 = const()[name = string("op_4253_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4253_keep_dims_0 = const()[name = string("op_4253_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4253 = reduce_mean(axes = var_4253_axes_0, keep_dims = var_4253_keep_dims_0, x = var_4251)[name = string("op_4253")];
+            string var_4253_to_fp16_dtype_0 = const()[name = string("op_4253_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4254_to_fp16 = const()[name = string("op_4254_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4253_to_fp16 = cast(dtype = var_4253_to_fp16_dtype_0, x = var_4253)[name = string("cast_123")];
+            tensor<fp16, [1, 50, 1]> mean_squared_167_cast_fp16 = add(x = var_4253_to_fp16, y = var_4254_to_fp16)[name = string("mean_squared_167_cast_fp16")];
+            string mean_squared_167_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_167_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_167_cast_fp16_to_fp32 = cast(dtype = mean_squared_167_cast_fp16_to_fp32_dtype_0, x = mean_squared_167_cast_fp16)[name = string("cast_122")];
+            tensor<fp32, [1, 50, 1]> var_4256 = pow(x = mean_squared_167_cast_fp16_to_fp32, y = var_4241)[name = string("op_4256")];
+            string var_4256_to_fp16_dtype_0 = const()[name = string("op_4256_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4256_to_fp16 = cast(dtype = var_4256_to_fp16_dtype_0, x = var_4256)[name = string("cast_121")];
+            tensor<fp16, [1, 50, 1024]> normed_output_333_cast_fp16 = mul(x = clip_258_cast_fp16, y = var_4256_to_fp16)[name = string("normed_output_333_cast_fp16")];
+            tensor<fp16, [1024]> const_139_to_fp16 = const()[name = string("const_139_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113909888)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_335_cast_fp16 = mul(x = normed_output_333_cast_fp16, y = const_139_to_fp16)[name = string("normed_output_335_cast_fp16")];
+            int32 var_4262 = const()[name = string("op_4262"), val = int32(-1)];
+            fp32 var_4263 = const()[name = string("op_4263"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_9_q_proj_input_min_to_fp16 = const()[name = string("self_attns_9_q_proj_input_min_to_fp16"), val = fp16(-0x1.24p+3)];
+            fp16 self_attns_9_q_proj_input_max_to_fp16 = const()[name = string("self_attns_9_q_proj_input_max_to_fp16"), val = fp16(0x1.22p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_259_cast_fp16 = clip(alpha = self_attns_9_q_proj_input_min_to_fp16, beta = self_attns_9_q_proj_input_max_to_fp16, x = normed_output_335_cast_fp16)[name = string("clip_259_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_9_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113912000))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114436352))))[name = string("self_attns_9_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_102_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_9_q_proj_linear_weight_to_fp16_palettized, x = clip_259_cast_fp16)[name = string("linear_102_cast_fp16")];
+            fp16 self_attns_9_q_proj_output_min_to_fp16 = const()[name = string("self_attns_9_q_proj_output_min_to_fp16"), val = fp16(-0x1.06p+4)];
+            fp16 self_attns_9_q_proj_output_max_to_fp16 = const()[name = string("self_attns_9_q_proj_output_max_to_fp16"), val = fp16(0x1.04p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_260_cast_fp16 = clip(alpha = self_attns_9_q_proj_output_min_to_fp16, beta = self_attns_9_q_proj_output_max_to_fp16, x = linear_102_cast_fp16)[name = string("clip_260_cast_fp16")];
+            tensor<int32, [4]> var_4307 = const()[name = string("op_4307"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_19_cast_fp16 = reshape(shape = var_4307, x = clip_260_cast_fp16)[name = string("q_19_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_9_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114437440))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114961792))))[name = string("self_attns_9_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_103_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_9_k_proj_linear_weight_to_fp16_palettized, x = clip_259_cast_fp16)[name = string("linear_103_cast_fp16")];
+            fp16 self_attns_9_k_proj_output_min_to_fp16 = const()[name = string("self_attns_9_k_proj_output_min_to_fp16"), val = fp16(-0x1.06p+4)];
+            fp16 self_attns_9_k_proj_output_max_to_fp16 = const()[name = string("self_attns_9_k_proj_output_max_to_fp16"), val = fp16(0x1.04p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_262_cast_fp16 = clip(alpha = self_attns_9_k_proj_output_min_to_fp16, beta = self_attns_9_k_proj_output_max_to_fp16, x = linear_103_cast_fp16)[name = string("clip_262_cast_fp16")];
+            tensor<int32, [4]> var_4319 = const()[name = string("op_4319"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_19_cast_fp16 = reshape(shape = var_4319, x = clip_262_cast_fp16)[name = string("k_19_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_9_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114962880))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(115487232))))[name = string("self_attns_9_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_104_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_9_v_proj_linear_weight_to_fp16_palettized, x = clip_259_cast_fp16)[name = string("linear_104_cast_fp16")];
+            fp16 self_attns_9_v_proj_output_min_to_fp16 = const()[name = string("self_attns_9_v_proj_output_min_to_fp16"), val = fp16(-0x1.06p+4)];
+            fp16 self_attns_9_v_proj_output_max_to_fp16 = const()[name = string("self_attns_9_v_proj_output_max_to_fp16"), val = fp16(0x1.04p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_264_cast_fp16 = clip(alpha = self_attns_9_v_proj_output_min_to_fp16, beta = self_attns_9_v_proj_output_max_to_fp16, x = linear_104_cast_fp16)[name = string("clip_264_cast_fp16")];
+            tensor<int32, [4]> var_4331 = const()[name = string("op_4331"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_411_cast_fp16 = reshape(shape = var_4331, x = clip_264_cast_fp16)[name = string("input_411_cast_fp16")];
+            fp16 var_4333_to_fp16 = const()[name = string("op_4333_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_4334_cast_fp16 = mul(x = q_19_cast_fp16, y = var_4333_to_fp16)[name = string("op_4334_cast_fp16")];
+            tensor<fp16, [128]> var_4335_to_fp16 = const()[name = string("op_4335_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(115488320)))];
+            tensor<fp16, [1, 50, 8, 128]> input_407_cast_fp16 = mul(x = var_4334_cast_fp16, y = var_4335_to_fp16)[name = string("input_407_cast_fp16")];
+            fp16 var_4337_to_fp16 = const()[name = string("op_4337_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_409_cast_fp16 = mul(x = k_19_cast_fp16, y = var_4337_to_fp16)[name = string("input_409_cast_fp16")];
+            tensor<int32, [8]> q_padded_19_pad_0 = const()[name = string("q_padded_19_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_19_mode_0 = const()[name = string("q_padded_19_mode_0"), val = string("constant")];
+            fp16 const_140_to_fp16 = const()[name = string("const_140_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_19_cast_fp16 = pad(constant_val = const_140_to_fp16, mode = q_padded_19_mode_0, pad = q_padded_19_pad_0, x = input_407_cast_fp16)[name = string("q_padded_19_cast_fp16")];
+            tensor<int32, [5]> var_4341 = const()[name = string("op_4341"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_19_cast_fp16 = reshape(shape = var_4341, x = q_padded_19_cast_fp16)[name = string("q_blocks_19_cast_fp16")];
+            tensor<int32, [8]> k_padded_19_pad_0 = const()[name = string("k_padded_19_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_19_mode_0 = const()[name = string("k_padded_19_mode_0"), val = string("constant")];
+            fp16 const_141_to_fp16 = const()[name = string("const_141_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_19_cast_fp16 = pad(constant_val = const_141_to_fp16, mode = k_padded_19_mode_0, pad = k_padded_19_pad_0, x = input_409_cast_fp16)[name = string("k_padded_19_cast_fp16")];
+            tensor<int32, [8]> v_padded_19_pad_0 = const()[name = string("v_padded_19_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_19_mode_0 = const()[name = string("v_padded_19_mode_0"), val = string("constant")];
+            fp16 const_142_to_fp16 = const()[name = string("const_142_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_19_cast_fp16 = pad(constant_val = const_142_to_fp16, mode = v_padded_19_mode_0, pad = v_padded_19_pad_0, x = input_411_cast_fp16)[name = string("v_padded_19_cast_fp16")];
+            tensor<int32, [4]> var_4348_begin_0 = const()[name = string("op_4348_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4348_end_0 = const()[name = string("op_4348_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_4348_end_mask_0 = const()[name = string("op_4348_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4348_cast_fp16 = slice_by_index(begin = var_4348_begin_0, end = var_4348_end_0, end_mask = var_4348_end_mask_0, x = k_padded_19_cast_fp16)[name = string("op_4348_cast_fp16")];
+            tensor<int32, [4]> var_4350_begin_0 = const()[name = string("op_4350_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_4350_end_0 = const()[name = string("op_4350_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_4350_end_mask_0 = const()[name = string("op_4350_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4350_cast_fp16 = slice_by_index(begin = var_4350_begin_0, end = var_4350_end_0, end_mask = var_4350_end_mask_0, x = k_padded_19_cast_fp16)[name = string("op_4350_cast_fp16")];
+            tensor<int32, [4]> var_4352_begin_0 = const()[name = string("op_4352_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_4352_end_0 = const()[name = string("op_4352_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_4352_end_mask_0 = const()[name = string("op_4352_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4352_cast_fp16 = slice_by_index(begin = var_4352_begin_0, end = var_4352_end_0, end_mask = var_4352_end_mask_0, x = k_padded_19_cast_fp16)[name = string("op_4352_cast_fp16")];
+            tensor<int32, [4]> var_4354_begin_0 = const()[name = string("op_4354_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_4354_end_0 = const()[name = string("op_4354_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_4354_end_mask_0 = const()[name = string("op_4354_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4354_cast_fp16 = slice_by_index(begin = var_4354_begin_0, end = var_4354_end_0, end_mask = var_4354_end_mask_0, x = k_padded_19_cast_fp16)[name = string("op_4354_cast_fp16")];
+            tensor<int32, [4]> var_4356_begin_0 = const()[name = string("op_4356_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_4356_end_0 = const()[name = string("op_4356_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_4356_end_mask_0 = const()[name = string("op_4356_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4356_cast_fp16 = slice_by_index(begin = var_4356_begin_0, end = var_4356_end_0, end_mask = var_4356_end_mask_0, x = k_padded_19_cast_fp16)[name = string("op_4356_cast_fp16")];
+            int32 k_blocks_19_axis_0 = const()[name = string("k_blocks_19_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_19_cast_fp16 = stack(axis = k_blocks_19_axis_0, values = (var_4348_cast_fp16, var_4350_cast_fp16, var_4352_cast_fp16, var_4354_cast_fp16, var_4356_cast_fp16))[name = string("k_blocks_19_cast_fp16")];
+            tensor<int32, [4]> var_4360_begin_0 = const()[name = string("op_4360_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4360_end_0 = const()[name = string("op_4360_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_4360_end_mask_0 = const()[name = string("op_4360_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4360_cast_fp16 = slice_by_index(begin = var_4360_begin_0, end = var_4360_end_0, end_mask = var_4360_end_mask_0, x = v_padded_19_cast_fp16)[name = string("op_4360_cast_fp16")];
+            tensor<int32, [4]> var_4362_begin_0 = const()[name = string("op_4362_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_4362_end_0 = const()[name = string("op_4362_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_4362_end_mask_0 = const()[name = string("op_4362_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4362_cast_fp16 = slice_by_index(begin = var_4362_begin_0, end = var_4362_end_0, end_mask = var_4362_end_mask_0, x = v_padded_19_cast_fp16)[name = string("op_4362_cast_fp16")];
+            tensor<int32, [4]> var_4364_begin_0 = const()[name = string("op_4364_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_4364_end_0 = const()[name = string("op_4364_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_4364_end_mask_0 = const()[name = string("op_4364_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4364_cast_fp16 = slice_by_index(begin = var_4364_begin_0, end = var_4364_end_0, end_mask = var_4364_end_mask_0, x = v_padded_19_cast_fp16)[name = string("op_4364_cast_fp16")];
+            tensor<int32, [4]> var_4366_begin_0 = const()[name = string("op_4366_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_4366_end_0 = const()[name = string("op_4366_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_4366_end_mask_0 = const()[name = string("op_4366_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4366_cast_fp16 = slice_by_index(begin = var_4366_begin_0, end = var_4366_end_0, end_mask = var_4366_end_mask_0, x = v_padded_19_cast_fp16)[name = string("op_4366_cast_fp16")];
+            tensor<int32, [4]> var_4368_begin_0 = const()[name = string("op_4368_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_4368_end_0 = const()[name = string("op_4368_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_4368_end_mask_0 = const()[name = string("op_4368_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4368_cast_fp16 = slice_by_index(begin = var_4368_begin_0, end = var_4368_end_0, end_mask = var_4368_end_mask_0, x = v_padded_19_cast_fp16)[name = string("op_4368_cast_fp16")];
+            int32 v_blocks_19_axis_0 = const()[name = string("v_blocks_19_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_19_cast_fp16 = stack(axis = v_blocks_19_axis_0, values = (var_4360_cast_fp16, var_4362_cast_fp16, var_4364_cast_fp16, var_4366_cast_fp16, var_4368_cast_fp16))[name = string("v_blocks_19_cast_fp16")];
+            tensor<int32, [5]> var_4376 = const()[name = string("op_4376"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_4378 = const()[name = string("op_4378"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_19_transpose_x_0 = const()[name = string("matrix_ac_19_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_19_transpose_y_0 = const()[name = string("matrix_ac_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_19_cast_fp16 = transpose(perm = var_4376, x = q_blocks_19_cast_fp16)[name = string("transpose_16")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_19_cast_fp16 = transpose(perm = var_4378, x = k_blocks_19_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_19_cast_fp16 = matmul(transpose_x = matrix_ac_19_transpose_x_0, transpose_y = matrix_ac_19_transpose_y_0, x = queries_19_cast_fp16, y = keys_t_19_cast_fp16)[name = string("matrix_ac_19_cast_fp16")];
+            tensor<int32, [4]> var_4381 = const()[name = string("op_4381"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_19_cast_fp16 = reshape(shape = var_4381, x = queries_19_cast_fp16)[name = string("q_flat_19_cast_fp16")];
+            bool matrix_bd_91_transpose_x_0 = const()[name = string("matrix_bd_91_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_91_transpose_y_0 = const()[name = string("matrix_bd_91_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_19_to_fp16 = const()[name = string("rel_k_t_19_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(115488640)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_91_cast_fp16 = matmul(transpose_x = matrix_bd_91_transpose_x_0, transpose_y = matrix_bd_91_transpose_y_0, x = q_flat_19_cast_fp16, y = rel_k_t_19_to_fp16)[name = string("matrix_bd_91_cast_fp16")];
+            tensor<int32, [5]> var_4386 = const()[name = string("op_4386"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_413_cast_fp16 = reshape(shape = var_4386, x = matrix_bd_91_cast_fp16)[name = string("input_413_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_93_pad_0 = const()[name = string("matrix_bd_93_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(115515328)))];
+            string matrix_bd_93_mode_0 = const()[name = string("matrix_bd_93_mode_0"), val = string("constant")];
+            fp16 const_144_to_fp16 = const()[name = string("const_144_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_93_cast_fp16 = pad(constant_val = const_144_to_fp16, mode = matrix_bd_93_mode_0, pad = matrix_bd_93_pad_0, x = input_413_cast_fp16)[name = string("matrix_bd_93_cast_fp16")];
+            tensor<int32, [4]> var_4390 = const()[name = string("op_4390"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_95_cast_fp16 = reshape(shape = var_4390, x = matrix_bd_93_cast_fp16)[name = string("matrix_bd_95_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_97_begin_0 = const()[name = string("matrix_bd_97_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_97_end_0 = const()[name = string("matrix_bd_97_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_97_end_mask_0 = const()[name = string("matrix_bd_97_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_97_cast_fp16 = slice_by_index(begin = matrix_bd_97_begin_0, end = matrix_bd_97_end_0, end_mask = matrix_bd_97_end_mask_0, x = matrix_bd_95_cast_fp16)[name = string("matrix_bd_97_cast_fp16")];
+            tensor<int32, [5]> var_4396 = const()[name = string("op_4396"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_99_cast_fp16 = reshape(shape = var_4396, x = matrix_bd_97_cast_fp16)[name = string("matrix_bd_99_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_55_cast_fp16 = add(x = matrix_ac_19_cast_fp16, y = matrix_bd_99_cast_fp16)[name = string("attn_55_cast_fp16")];
+            fp16 _inversed_4399_y_0_to_fp16 = const()[name = string("_inversed_4399_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_4399_cast_fp16 = mul(x = attn_55_cast_fp16, y = _inversed_4399_y_0_to_fp16)[name = string("_inversed_4399_cast_fp16")];
+            string _inversed_4399_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_4399_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_4399_cast_fp16_to_fp32 = cast(dtype = _inversed_4399_cast_fp16_to_fp32_dtype_0, x = _inversed_4399_cast_fp16)[name = string("cast_120")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_4400 = tanh(x = _inversed_4399_cast_fp16_to_fp32)[name = string("op_4400")];
+            string var_4400_to_fp16_dtype_0 = const()[name = string("op_4400_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_9_softcap_to_fp16 = const()[name = string("self_attns_9_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_4400_to_fp16 = cast(dtype = var_4400_to_fp16_dtype_0, x = var_4400)[name = string("cast_119")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_57_cast_fp16 = mul(x = var_4400_to_fp16, y = self_attns_9_softcap_to_fp16)[name = string("attn_57_cast_fp16")];
+            string attn_57_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_57_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_57_cast_fp16_to_fp32 = cast(dtype = attn_57_cast_fp16_to_fp32_dtype_0, x = attn_57_cast_fp16)[name = string("cast_118")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_415 = select(a = var_4263, b = attn_57_cast_fp16_to_fp32, cond = var_460)[name = string("input_415")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_4404 = softmax(axis = var_4262, x = input_415)[name = string("op_4404")];
+            tensor<int32, [5]> var_4406 = const()[name = string("op_4406"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_55_transpose_x_0 = const()[name = string("out_55_transpose_x_0"), val = bool(false)];
+            bool out_55_transpose_y_0 = const()[name = string("out_55_transpose_y_0"), val = bool(false)];
+            string var_4404_to_fp16_dtype_0 = const()[name = string("op_4404_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_19_cast_fp16 = transpose(perm = var_4406, x = v_blocks_19_cast_fp16)[name = string("transpose_15")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_4404_to_fp16 = cast(dtype = var_4404_to_fp16_dtype_0, x = var_4404)[name = string("cast_117")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_55_cast_fp16 = matmul(transpose_x = out_55_transpose_x_0, transpose_y = out_55_transpose_y_0, x = var_4404_to_fp16, y = values_t_19_cast_fp16)[name = string("out_55_cast_fp16")];
+            tensor<int32, [5]> var_4409 = const()[name = string("op_4409"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_4411 = const()[name = string("op_4411"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_4410_cast_fp16 = transpose(perm = var_4409, x = out_55_cast_fp16)[name = string("transpose_14")];
+            tensor<fp16, [1, 60, 1024]> out_57_cast_fp16 = reshape(shape = var_4411, x = var_4410_cast_fp16)[name = string("out_57_cast_fp16")];
+            tensor<int32, [3]> var_4414_begin_0 = const()[name = string("op_4414_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_4414_end_0 = const()[name = string("op_4414_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_4414_end_mask_0 = const()[name = string("op_4414_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_4414_cast_fp16 = slice_by_index(begin = var_4414_begin_0, end = var_4414_end_0, end_mask = var_4414_end_mask_0, x = out_57_cast_fp16)[name = string("op_4414_cast_fp16")];
+            fp16 self_attns_9_post_input_min_to_fp16 = const()[name = string("self_attns_9_post_input_min_to_fp16"), val = fp16(-0x1.fap+3)];
+            fp16 self_attns_9_post_input_max_to_fp16 = const()[name = string("self_attns_9_post_input_max_to_fp16"), val = fp16(0x1.f6p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_265_cast_fp16 = clip(alpha = self_attns_9_post_input_min_to_fp16, beta = self_attns_9_post_input_max_to_fp16, x = var_4414_cast_fp16)[name = string("clip_265_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_9_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(115515456))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116039808))))[name = string("self_attns_9_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_106_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_9_post_linear_weight_to_fp16_palettized, x = clip_265_cast_fp16)[name = string("linear_106_cast_fp16")];
+            fp16 self_attns_9_post_output_min_to_fp16 = const()[name = string("self_attns_9_post_output_min_to_fp16"), val = fp16(-0x1.b8p+5)];
+            fp16 self_attns_9_post_output_max_to_fp16 = const()[name = string("self_attns_9_post_output_max_to_fp16"), val = fp16(0x1.b4p+5)];
+            tensor<fp16, [1, 50, 1024]> clip_266_cast_fp16 = clip(alpha = self_attns_9_post_output_min_to_fp16, beta = self_attns_9_post_output_max_to_fp16, x = linear_106_cast_fp16)[name = string("clip_266_cast_fp16")];
+            fp16 var_4426_to_fp16 = const()[name = string("op_4426_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_4427_to_fp16 = const()[name = string("op_4427_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_267_cast_fp16 = clip(alpha = var_4426_to_fp16, beta = var_4427_to_fp16, x = clip_266_cast_fp16)[name = string("clip_267_cast_fp16")];
+            string clip_267_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_267_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4429 = const()[name = string("op_4429"), val = fp32(-0x1p-1)];
+            fp32 var_4433_promoted = const()[name = string("op_4433_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_267_cast_fp16_to_fp32 = cast(dtype = clip_267_cast_fp16_to_fp32_dtype_0, x = clip_267_cast_fp16)[name = string("cast_116")];
+            tensor<fp32, [1, 50, 1024]> var_4439 = pow(x = clip_267_cast_fp16_to_fp32, y = var_4433_promoted)[name = string("op_4439")];
+            tensor<int32, [1]> var_4441_axes_0 = const()[name = string("op_4441_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4441_keep_dims_0 = const()[name = string("op_4441_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4441 = reduce_mean(axes = var_4441_axes_0, keep_dims = var_4441_keep_dims_0, x = var_4439)[name = string("op_4441")];
+            string var_4441_to_fp16_dtype_0 = const()[name = string("op_4441_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4442_to_fp16 = const()[name = string("op_4442_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4441_to_fp16 = cast(dtype = var_4441_to_fp16_dtype_0, x = var_4441)[name = string("cast_115")];
+            tensor<fp16, [1, 50, 1]> mean_squared_169_cast_fp16 = add(x = var_4441_to_fp16, y = var_4442_to_fp16)[name = string("mean_squared_169_cast_fp16")];
+            string mean_squared_169_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_169_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_169_cast_fp16_to_fp32 = cast(dtype = mean_squared_169_cast_fp16_to_fp32_dtype_0, x = mean_squared_169_cast_fp16)[name = string("cast_114")];
+            tensor<fp32, [1, 50, 1]> var_4444 = pow(x = mean_squared_169_cast_fp16_to_fp32, y = var_4429)[name = string("op_4444")];
+            string var_4444_to_fp16_dtype_0 = const()[name = string("op_4444_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4444_to_fp16 = cast(dtype = var_4444_to_fp16_dtype_0, x = var_4444)[name = string("cast_113")];
+            tensor<fp16, [1, 50, 1024]> normed_output_337_cast_fp16 = mul(x = clip_267_cast_fp16, y = var_4444_to_fp16)[name = string("normed_output_337_cast_fp16")];
+            tensor<fp16, [1024]> const_145_to_fp16 = const()[name = string("const_145_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116040896)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_339_cast_fp16 = mul(x = normed_output_337_cast_fp16, y = const_145_to_fp16)[name = string("normed_output_339_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_979_cast_fp16 = add(x = normed_output_339_cast_fp16, y = hidden_states_953_cast_fp16)[name = string("hidden_states_979_cast_fp16")];
+            string hidden_states_979_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_979_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4451 = const()[name = string("op_4451"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4452 = const()[name = string("op_4452"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_4464 = const()[name = string("op_4464"), val = fp32(-0x1p-1)];
+            fp32 var_4460_promoted = const()[name = string("op_4460_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_979_cast_fp16_to_fp32 = cast(dtype = hidden_states_979_cast_fp16_to_fp32_dtype_0, x = hidden_states_979_cast_fp16)[name = string("cast_112")];
+            tensor<fp32, [1, 50, 1024]> var_4472 = pow(x = hidden_states_979_cast_fp16_to_fp32, y = var_4460_promoted)[name = string("op_4472")];
+            tensor<int32, [1]> var_4474_axes_0 = const()[name = string("op_4474_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4474_keep_dims_0 = const()[name = string("op_4474_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4474 = reduce_mean(axes = var_4474_axes_0, keep_dims = var_4474_keep_dims_0, x = var_4472)[name = string("op_4474")];
+            string var_4474_to_fp16_dtype_0 = const()[name = string("op_4474_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4475_to_fp16 = const()[name = string("op_4475_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4474_to_fp16 = cast(dtype = var_4474_to_fp16_dtype_0, x = var_4474)[name = string("cast_111")];
+            tensor<fp16, [1, 50, 1]> mean_squared_171_cast_fp16 = add(x = var_4474_to_fp16, y = var_4475_to_fp16)[name = string("mean_squared_171_cast_fp16")];
+            string mean_squared_171_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_171_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_171_cast_fp16_to_fp32 = cast(dtype = mean_squared_171_cast_fp16_to_fp32_dtype_0, x = mean_squared_171_cast_fp16)[name = string("cast_110")];
+            tensor<fp32, [1, 50, 1]> var_4477 = pow(x = mean_squared_171_cast_fp16_to_fp32, y = var_4464)[name = string("op_4477")];
+            string var_4477_to_fp16_dtype_0 = const()[name = string("op_4477_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4477_to_fp16 = cast(dtype = var_4477_to_fp16_dtype_0, x = var_4477)[name = string("cast_109")];
+            tensor<fp16, [1, 50, 1024]> normed_output_341_cast_fp16 = mul(x = hidden_states_979_cast_fp16, y = var_4477_to_fp16)[name = string("normed_output_341_cast_fp16")];
+            tensor<fp16, [1024]> const_146_to_fp16 = const()[name = string("const_146_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116043008)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_343_cast_fp16 = mul(x = normed_output_341_cast_fp16, y = const_146_to_fp16)[name = string("normed_output_343_cast_fp16")];
+            fp16 lconv1ds_9_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_9_linear_start_input_min_to_fp16"), val = fp16(-0x1.5p+3)];
+            fp16 lconv1ds_9_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_9_linear_start_input_max_to_fp16"), val = fp16(0x1.4ep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_268_cast_fp16 = clip(alpha = lconv1ds_9_linear_start_input_min_to_fp16, beta = lconv1ds_9_linear_start_input_max_to_fp16, x = normed_output_343_cast_fp16)[name = string("clip_268_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_9_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116045120))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117093760))))[name = string("lconv1ds_9_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_107_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_9_linear_start_linear_weight_to_fp16_palettized, x = clip_268_cast_fp16)[name = string("linear_107_cast_fp16")];
+            fp16 lconv1ds_9_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_9_linear_start_output_min_to_fp16"), val = fp16(-0x1.86p+4)];
+            fp16 lconv1ds_9_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_9_linear_start_output_max_to_fp16"), val = fp16(0x1.82p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_269_cast_fp16 = clip(alpha = lconv1ds_9_linear_start_output_min_to_fp16, beta = lconv1ds_9_linear_start_output_max_to_fp16, x = linear_107_cast_fp16)[name = string("clip_269_cast_fp16")];
+            int32 hidden_states_987_split_num_splits_0 = const()[name = string("hidden_states_987_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_987_split_axis_0 = const()[name = string("hidden_states_987_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_987_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_987_split_cast_fp16_1 = split(axis = hidden_states_987_split_axis_0, num_splits = hidden_states_987_split_num_splits_0, x = clip_269_cast_fp16)[name = string("hidden_states_987_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_987_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_987_split_cast_fp16_1)[name = string("hidden_states_987_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_987_cast_fp16 = mul(x = hidden_states_987_split_cast_fp16_0, y = hidden_states_987_split_1_sigmoid_cast_fp16)[name = string("hidden_states_987_cast_fp16")];
+            tensor<int32, [3]> input_423_perm_0 = const()[name = string("input_423_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_425_pad_0 = const()[name = string("input_425_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_425_mode_0 = const()[name = string("input_425_mode_0"), val = string("constant")];
+            fp16 const_147_to_fp16 = const()[name = string("const_147_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_423_cast_fp16 = transpose(perm = input_423_perm_0, x = hidden_states_987_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [1, 1024, 54]> input_425_cast_fp16 = pad(constant_val = const_147_to_fp16, mode = input_425_mode_0, pad = input_425_pad_0, x = input_423_cast_fp16)[name = string("input_425_cast_fp16")];
+            string var_4503_pad_type_0 = const()[name = string("op_4503_pad_type_0"), val = string("valid")];
+            int32 var_4503_groups_0 = const()[name = string("op_4503_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_4503_strides_0 = const()[name = string("op_4503_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4503_pad_0 = const()[name = string("op_4503_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4503_dilations_0 = const()[name = string("op_4503_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_9_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117095872))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117098496))))[name = string("lconv1ds_9_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_4503_cast_fp16 = conv(dilations = var_4503_dilations_0, groups = var_4503_groups_0, pad = var_4503_pad_0, pad_type = var_4503_pad_type_0, strides = var_4503_strides_0, weight = lconv1ds_9_depthwise_conv1d_weight_to_fp16_palettized, x = input_425_cast_fp16)[name = string("op_4503_cast_fp16")];
+            tensor<int32, [3]> hidden_states_989_perm_0 = const()[name = string("hidden_states_989_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_989_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_989_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_989_cast_fp16 = transpose(perm = hidden_states_989_perm_0, x = var_4503_cast_fp16)[name = string("transpose_12")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_989_cast_fp16_to_fp32 = cast(dtype = hidden_states_989_cast_fp16_to_fp32_dtype_0, x = hidden_states_989_cast_fp16)[name = string("cast_108")];
+            tensor<fp32, [1, 50, 1024]> clip_270 = clip(alpha = var_4452, beta = var_4451, x = hidden_states_989_cast_fp16_to_fp32)[name = string("clip_270")];
+            fp32 var_4460_promoted_1 = const()[name = string("op_4460_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4508 = pow(x = clip_270, y = var_4460_promoted_1)[name = string("op_4508")];
+            tensor<int32, [1]> var_4510_axes_0 = const()[name = string("op_4510_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4510_keep_dims_0 = const()[name = string("op_4510_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4510 = reduce_mean(axes = var_4510_axes_0, keep_dims = var_4510_keep_dims_0, x = var_4508)[name = string("op_4510")];
+            string var_4510_to_fp16_dtype_0 = const()[name = string("op_4510_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4511_to_fp16 = const()[name = string("op_4511_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4510_to_fp16 = cast(dtype = var_4510_to_fp16_dtype_0, x = var_4510)[name = string("cast_107")];
+            tensor<fp16, [1, 50, 1]> mean_squared_173_cast_fp16 = add(x = var_4510_to_fp16, y = var_4511_to_fp16)[name = string("mean_squared_173_cast_fp16")];
+            string mean_squared_173_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_173_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_173_cast_fp16_to_fp32 = cast(dtype = mean_squared_173_cast_fp16_to_fp32_dtype_0, x = mean_squared_173_cast_fp16)[name = string("cast_106")];
+            tensor<fp32, [1, 50, 1]> var_4513 = pow(x = mean_squared_173_cast_fp16_to_fp32, y = var_4464)[name = string("op_4513")];
+            string clip_270_to_fp16_dtype_0 = const()[name = string("clip_270_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4513_to_fp16_dtype_0 = const()[name = string("op_4513_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_270_to_fp16 = cast(dtype = clip_270_to_fp16_dtype_0, x = clip_270)[name = string("cast_104")];
+            tensor<fp16, [1, 50, 1]> var_4513_to_fp16 = cast(dtype = var_4513_to_fp16_dtype_0, x = var_4513)[name = string("cast_105")];
+            tensor<fp16, [1, 50, 1024]> normed_output_345_cast_fp16 = mul(x = clip_270_to_fp16, y = var_4513_to_fp16)[name = string("normed_output_345_cast_fp16")];
+            tensor<fp16, [1024]> const_148_to_fp16 = const()[name = string("const_148_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117099584)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_347_cast_fp16 = mul(x = normed_output_345_cast_fp16, y = const_148_to_fp16)[name = string("normed_output_347_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_995_cast_fp16 = silu(x = normed_output_347_cast_fp16)[name = string("hidden_states_995_cast_fp16")];
+            fp16 lconv1ds_9_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_9_linear_end_input_min_to_fp16"), val = fp16(-0x1.02p+3)];
+            fp16 lconv1ds_9_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_9_linear_end_input_max_to_fp16"), val = fp16(0x1p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_271_cast_fp16 = clip(alpha = lconv1ds_9_linear_end_input_min_to_fp16, beta = lconv1ds_9_linear_end_input_max_to_fp16, x = hidden_states_995_cast_fp16)[name = string("clip_271_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_9_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117101696))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117626048))))[name = string("lconv1ds_9_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_108_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_9_linear_end_linear_weight_to_fp16_palettized, x = clip_271_cast_fp16)[name = string("linear_108_cast_fp16")];
+            fp16 lconv1ds_9_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_9_linear_end_output_min_to_fp16"), val = fp16(-0x1.02p+3)];
+            fp16 lconv1ds_9_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_9_linear_end_output_max_to_fp16"), val = fp16(0x1p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_272_cast_fp16 = clip(alpha = lconv1ds_9_linear_end_output_min_to_fp16, beta = lconv1ds_9_linear_end_output_max_to_fp16, x = linear_108_cast_fp16)[name = string("clip_272_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1001_cast_fp16 = add(x = clip_272_cast_fp16, y = hidden_states_979_cast_fp16)[name = string("hidden_states_1001_cast_fp16")];
+            string hidden_states_1001_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1001_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4537 = const()[name = string("op_4537"), val = fp32(-0x1p-1)];
+            fp32 var_4538 = const()[name = string("op_4538"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4539 = const()[name = string("op_4539"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1001_cast_fp16_to_fp32 = cast(dtype = hidden_states_1001_cast_fp16_to_fp32_dtype_0, x = hidden_states_1001_cast_fp16)[name = string("cast_103")];
+            tensor<fp32, [1, 50, 1024]> clip_273 = clip(alpha = var_4539, beta = var_4538, x = hidden_states_1001_cast_fp16_to_fp32)[name = string("clip_273")];
+            fp32 var_4533_promoted = const()[name = string("op_4533_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4547 = pow(x = clip_273, y = var_4533_promoted)[name = string("op_4547")];
+            tensor<int32, [1]> var_4549_axes_0 = const()[name = string("op_4549_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4549_keep_dims_0 = const()[name = string("op_4549_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4549 = reduce_mean(axes = var_4549_axes_0, keep_dims = var_4549_keep_dims_0, x = var_4547)[name = string("op_4549")];
+            string var_4549_to_fp16_dtype_0 = const()[name = string("op_4549_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4550_to_fp16 = const()[name = string("op_4550_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4549_to_fp16 = cast(dtype = var_4549_to_fp16_dtype_0, x = var_4549)[name = string("cast_102")];
+            tensor<fp16, [1, 50, 1]> mean_squared_175_cast_fp16 = add(x = var_4549_to_fp16, y = var_4550_to_fp16)[name = string("mean_squared_175_cast_fp16")];
+            string mean_squared_175_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_175_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_175_cast_fp16_to_fp32 = cast(dtype = mean_squared_175_cast_fp16_to_fp32_dtype_0, x = mean_squared_175_cast_fp16)[name = string("cast_101")];
+            tensor<fp32, [1, 50, 1]> var_4552 = pow(x = mean_squared_175_cast_fp16_to_fp32, y = var_4537)[name = string("op_4552")];
+            string clip_273_to_fp16_dtype_0 = const()[name = string("clip_273_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4552_to_fp16_dtype_0 = const()[name = string("op_4552_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_273_to_fp16 = cast(dtype = clip_273_to_fp16_dtype_0, x = clip_273)[name = string("cast_99")];
+            tensor<fp16, [1, 50, 1]> var_4552_to_fp16 = cast(dtype = var_4552_to_fp16_dtype_0, x = var_4552)[name = string("cast_100")];
+            tensor<fp16, [1, 50, 1024]> normed_output_349_cast_fp16 = mul(x = clip_273_to_fp16, y = var_4552_to_fp16)[name = string("normed_output_349_cast_fp16")];
+            tensor<fp16, [1024]> const_149_to_fp16 = const()[name = string("const_149_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117627136)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_351_cast_fp16 = mul(x = normed_output_349_cast_fp16, y = const_149_to_fp16)[name = string("normed_output_351_cast_fp16")];
+            fp16 feed_forward2s_9_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.92p+3)];
+            fp16 feed_forward2s_9_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.9p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_274_cast_fp16 = clip(alpha = feed_forward2s_9_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_9_ffw_layer_1_input_max_to_fp16, x = normed_output_351_cast_fp16)[name = string("clip_274_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_9_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117629248))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(119726464))))[name = string("feed_forward2s_9_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_109_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_9_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_274_cast_fp16)[name = string("linear_109_cast_fp16")];
+            fp16 feed_forward2s_9_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.e6p+4)];
+            fp16 feed_forward2s_9_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.e2p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_275_cast_fp16 = clip(alpha = feed_forward2s_9_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_9_ffw_layer_1_output_max_to_fp16, x = linear_109_cast_fp16)[name = string("clip_275_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_1011_cast_fp16 = silu(x = clip_275_cast_fp16)[name = string("hidden_states_1011_cast_fp16")];
+            fp16 feed_forward2s_9_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.b4p+3)];
+            fp16 feed_forward2s_9_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.bp+3)];
+            tensor<fp16, [1, 50, 4096]> clip_276_cast_fp16 = clip(alpha = feed_forward2s_9_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_9_ffw_layer_2_input_max_to_fp16, x = hidden_states_1011_cast_fp16)[name = string("clip_276_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_9_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(119730624))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(121827840))))[name = string("feed_forward2s_9_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_110_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_9_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_276_cast_fp16)[name = string("linear_110_cast_fp16")];
+            fp16 feed_forward2s_9_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.68p+6)];
+            fp16 feed_forward2s_9_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_9_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.66p+6)];
+            tensor<fp16, [1, 50, 1024]> clip_277_cast_fp16 = clip(alpha = feed_forward2s_9_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_9_ffw_layer_2_output_max_to_fp16, x = linear_110_cast_fp16)[name = string("clip_277_cast_fp16")];
+            string clip_277_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_277_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_277_cast_fp16_to_fp32 = cast(dtype = clip_277_cast_fp16_to_fp32_dtype_0, x = clip_277_cast_fp16)[name = string("cast_98")];
+            tensor<fp32, [1, 50, 1024]> clip_278 = clip(alpha = var_4539, beta = var_4538, x = clip_277_cast_fp16_to_fp32)[name = string("clip_278")];
+            fp32 var_4533_promoted_1 = const()[name = string("op_4533_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4579 = pow(x = clip_278, y = var_4533_promoted_1)[name = string("op_4579")];
+            tensor<int32, [1]> var_4581_axes_0 = const()[name = string("op_4581_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4581_keep_dims_0 = const()[name = string("op_4581_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4581 = reduce_mean(axes = var_4581_axes_0, keep_dims = var_4581_keep_dims_0, x = var_4579)[name = string("op_4581")];
+            string var_4581_to_fp16_dtype_0 = const()[name = string("op_4581_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4582_to_fp16 = const()[name = string("op_4582_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4581_to_fp16 = cast(dtype = var_4581_to_fp16_dtype_0, x = var_4581)[name = string("cast_97")];
+            tensor<fp16, [1, 50, 1]> mean_squared_177_cast_fp16 = add(x = var_4581_to_fp16, y = var_4582_to_fp16)[name = string("mean_squared_177_cast_fp16")];
+            string mean_squared_177_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_177_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_177_cast_fp16_to_fp32 = cast(dtype = mean_squared_177_cast_fp16_to_fp32_dtype_0, x = mean_squared_177_cast_fp16)[name = string("cast_96")];
+            tensor<fp32, [1, 50, 1]> var_4584 = pow(x = mean_squared_177_cast_fp16_to_fp32, y = var_4537)[name = string("op_4584")];
+            string clip_278_to_fp16_dtype_0 = const()[name = string("clip_278_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4584_to_fp16_dtype_0 = const()[name = string("op_4584_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_278_to_fp16 = cast(dtype = clip_278_to_fp16_dtype_0, x = clip_278)[name = string("cast_94")];
+            tensor<fp16, [1, 50, 1]> var_4584_to_fp16 = cast(dtype = var_4584_to_fp16_dtype_0, x = var_4584)[name = string("cast_95")];
+            tensor<fp16, [1, 50, 1024]> normed_output_353_cast_fp16 = mul(x = clip_278_to_fp16, y = var_4584_to_fp16)[name = string("normed_output_353_cast_fp16")];
+            tensor<fp16, [1024]> const_150_to_fp16 = const()[name = string("const_150_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(121828928)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_355_cast_fp16 = mul(x = normed_output_353_cast_fp16, y = const_150_to_fp16)[name = string("normed_output_355_cast_fp16")];
+            fp16 var_4529_to_fp16 = const()[name = string("op_4529_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1023_cast_fp16 = mul(x = normed_output_355_cast_fp16, y = var_4529_to_fp16)[name = string("hidden_states_1023_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1025_cast_fp16 = add(x = hidden_states_1023_cast_fp16, y = hidden_states_1001_cast_fp16)[name = string("hidden_states_1025_cast_fp16")];
+            fp16 var_4591_to_fp16 = const()[name = string("op_4591_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_4592_to_fp16 = const()[name = string("op_4592_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_279_cast_fp16 = clip(alpha = var_4591_to_fp16, beta = var_4592_to_fp16, x = hidden_states_1025_cast_fp16)[name = string("clip_279_cast_fp16")];
+            string clip_279_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_279_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4594 = const()[name = string("op_4594"), val = fp32(-0x1p-1)];
+            fp32 var_4598_promoted = const()[name = string("op_4598_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_279_cast_fp16_to_fp32 = cast(dtype = clip_279_cast_fp16_to_fp32_dtype_0, x = clip_279_cast_fp16)[name = string("cast_93")];
+            tensor<fp32, [1, 50, 1024]> var_4604 = pow(x = clip_279_cast_fp16_to_fp32, y = var_4598_promoted)[name = string("op_4604")];
+            tensor<int32, [1]> var_4606_axes_0 = const()[name = string("op_4606_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4606_keep_dims_0 = const()[name = string("op_4606_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4606 = reduce_mean(axes = var_4606_axes_0, keep_dims = var_4606_keep_dims_0, x = var_4604)[name = string("op_4606")];
+            string var_4606_to_fp16_dtype_0 = const()[name = string("op_4606_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4607_to_fp16 = const()[name = string("op_4607_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4606_to_fp16 = cast(dtype = var_4606_to_fp16_dtype_0, x = var_4606)[name = string("cast_92")];
+            tensor<fp16, [1, 50, 1]> mean_squared_179_cast_fp16 = add(x = var_4606_to_fp16, y = var_4607_to_fp16)[name = string("mean_squared_179_cast_fp16")];
+            string mean_squared_179_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_179_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_179_cast_fp16_to_fp32 = cast(dtype = mean_squared_179_cast_fp16_to_fp32_dtype_0, x = mean_squared_179_cast_fp16)[name = string("cast_91")];
+            tensor<fp32, [1, 50, 1]> var_4609 = pow(x = mean_squared_179_cast_fp16_to_fp32, y = var_4594)[name = string("op_4609")];
+            string var_4609_to_fp16_dtype_0 = const()[name = string("op_4609_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4609_to_fp16 = cast(dtype = var_4609_to_fp16_dtype_0, x = var_4609)[name = string("cast_90")];
+            tensor<fp16, [1, 50, 1024]> normed_output_357_cast_fp16 = mul(x = clip_279_cast_fp16, y = var_4609_to_fp16)[name = string("normed_output_357_cast_fp16")];
+            tensor<fp16, [1024]> const_151_to_fp16 = const()[name = string("const_151_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(121831040)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_359_cast_fp16 = mul(x = normed_output_357_cast_fp16, y = const_151_to_fp16)[name = string("normed_output_359_cast_fp16")];
+            string normed_output_359_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_359_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4622 = const()[name = string("op_4622"), val = fp32(-0x1p-1)];
+            fp32 var_4623 = const()[name = string("op_4623"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4624 = const()[name = string("op_4624"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_359_cast_fp16_to_fp32 = cast(dtype = normed_output_359_cast_fp16_to_fp32_dtype_0, x = normed_output_359_cast_fp16)[name = string("cast_89")];
+            tensor<fp32, [1, 50, 1024]> clip_280 = clip(alpha = var_4624, beta = var_4623, x = normed_output_359_cast_fp16_to_fp32)[name = string("clip_280")];
+            fp32 var_4618_promoted = const()[name = string("op_4618_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4632 = pow(x = clip_280, y = var_4618_promoted)[name = string("op_4632")];
+            tensor<int32, [1]> var_4634_axes_0 = const()[name = string("op_4634_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4634_keep_dims_0 = const()[name = string("op_4634_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4634 = reduce_mean(axes = var_4634_axes_0, keep_dims = var_4634_keep_dims_0, x = var_4632)[name = string("op_4634")];
+            string var_4634_to_fp16_dtype_0 = const()[name = string("op_4634_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4635_to_fp16 = const()[name = string("op_4635_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4634_to_fp16 = cast(dtype = var_4634_to_fp16_dtype_0, x = var_4634)[name = string("cast_88")];
+            tensor<fp16, [1, 50, 1]> mean_squared_181_cast_fp16 = add(x = var_4634_to_fp16, y = var_4635_to_fp16)[name = string("mean_squared_181_cast_fp16")];
+            string mean_squared_181_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_181_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_181_cast_fp16_to_fp32 = cast(dtype = mean_squared_181_cast_fp16_to_fp32_dtype_0, x = mean_squared_181_cast_fp16)[name = string("cast_87")];
+            tensor<fp32, [1, 50, 1]> var_4637 = pow(x = mean_squared_181_cast_fp16_to_fp32, y = var_4622)[name = string("op_4637")];
+            string clip_280_to_fp16_dtype_0 = const()[name = string("clip_280_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4637_to_fp16_dtype_0 = const()[name = string("op_4637_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_280_to_fp16 = cast(dtype = clip_280_to_fp16_dtype_0, x = clip_280)[name = string("cast_85")];
+            tensor<fp16, [1, 50, 1]> var_4637_to_fp16 = cast(dtype = var_4637_to_fp16_dtype_0, x = var_4637)[name = string("cast_86")];
+            tensor<fp16, [1, 50, 1024]> normed_output_361_cast_fp16 = mul(x = clip_280_to_fp16, y = var_4637_to_fp16)[name = string("normed_output_361_cast_fp16")];
+            tensor<fp16, [1024]> const_152_to_fp16 = const()[name = string("const_152_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(121833152)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_363_cast_fp16 = mul(x = normed_output_361_cast_fp16, y = const_152_to_fp16)[name = string("normed_output_363_cast_fp16")];
+            fp16 feed_forward1s_10_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.aap+2)];
+            fp16 feed_forward1s_10_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.a6p+2)];
+            tensor<fp16, [1, 50, 1024]> clip_281_cast_fp16 = clip(alpha = feed_forward1s_10_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_10_ffw_layer_1_input_max_to_fp16, x = normed_output_363_cast_fp16)[name = string("clip_281_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_10_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(121835264))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(123932480))))[name = string("feed_forward1s_10_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_111_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_10_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_281_cast_fp16)[name = string("linear_111_cast_fp16")];
+            fp16 feed_forward1s_10_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.b4p+3)];
+            fp16 feed_forward1s_10_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.b2p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_282_cast_fp16 = clip(alpha = feed_forward1s_10_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_10_ffw_layer_1_output_max_to_fp16, x = linear_111_cast_fp16)[name = string("clip_282_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_1041_cast_fp16 = silu(x = clip_282_cast_fp16)[name = string("hidden_states_1041_cast_fp16")];
+            fp16 feed_forward1s_10_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.1ap+3)];
+            fp16 feed_forward1s_10_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.18p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_283_cast_fp16 = clip(alpha = feed_forward1s_10_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_10_ffw_layer_2_input_max_to_fp16, x = hidden_states_1041_cast_fp16)[name = string("clip_283_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_10_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(123936640))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126033856))))[name = string("feed_forward1s_10_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_112_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_10_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_283_cast_fp16)[name = string("linear_112_cast_fp16")];
+            fp16 feed_forward1s_10_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.8ep+5)];
+            fp16 feed_forward1s_10_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_10_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.8cp+5)];
+            tensor<fp16, [1, 50, 1024]> clip_284_cast_fp16 = clip(alpha = feed_forward1s_10_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_10_ffw_layer_2_output_max_to_fp16, x = linear_112_cast_fp16)[name = string("clip_284_cast_fp16")];
+            string clip_284_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_284_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_284_cast_fp16_to_fp32 = cast(dtype = clip_284_cast_fp16_to_fp32_dtype_0, x = clip_284_cast_fp16)[name = string("cast_84")];
+            tensor<fp32, [1, 50, 1024]> clip_285 = clip(alpha = var_4624, beta = var_4623, x = clip_284_cast_fp16_to_fp32)[name = string("clip_285")];
+            fp32 var_4618_promoted_1 = const()[name = string("op_4618_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4664 = pow(x = clip_285, y = var_4618_promoted_1)[name = string("op_4664")];
+            tensor<int32, [1]> var_4666_axes_0 = const()[name = string("op_4666_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4666_keep_dims_0 = const()[name = string("op_4666_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4666 = reduce_mean(axes = var_4666_axes_0, keep_dims = var_4666_keep_dims_0, x = var_4664)[name = string("op_4666")];
+            string var_4666_to_fp16_dtype_0 = const()[name = string("op_4666_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4667_to_fp16 = const()[name = string("op_4667_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4666_to_fp16 = cast(dtype = var_4666_to_fp16_dtype_0, x = var_4666)[name = string("cast_83")];
+            tensor<fp16, [1, 50, 1]> mean_squared_183_cast_fp16 = add(x = var_4666_to_fp16, y = var_4667_to_fp16)[name = string("mean_squared_183_cast_fp16")];
+            string mean_squared_183_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_183_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_183_cast_fp16_to_fp32 = cast(dtype = mean_squared_183_cast_fp16_to_fp32_dtype_0, x = mean_squared_183_cast_fp16)[name = string("cast_82")];
+            tensor<fp32, [1, 50, 1]> var_4669 = pow(x = mean_squared_183_cast_fp16_to_fp32, y = var_4622)[name = string("op_4669")];
+            string clip_285_to_fp16_dtype_0 = const()[name = string("clip_285_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4669_to_fp16_dtype_0 = const()[name = string("op_4669_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_285_to_fp16 = cast(dtype = clip_285_to_fp16_dtype_0, x = clip_285)[name = string("cast_80")];
+            tensor<fp16, [1, 50, 1]> var_4669_to_fp16 = cast(dtype = var_4669_to_fp16_dtype_0, x = var_4669)[name = string("cast_81")];
+            tensor<fp16, [1, 50, 1024]> normed_output_365_cast_fp16 = mul(x = clip_285_to_fp16, y = var_4669_to_fp16)[name = string("normed_output_365_cast_fp16")];
+            tensor<fp16, [1024]> const_153_to_fp16 = const()[name = string("const_153_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126034944)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_367_cast_fp16 = mul(x = normed_output_365_cast_fp16, y = const_153_to_fp16)[name = string("normed_output_367_cast_fp16")];
+            fp16 var_4614_to_fp16 = const()[name = string("op_4614_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1053_cast_fp16 = mul(x = normed_output_367_cast_fp16, y = var_4614_to_fp16)[name = string("hidden_states_1053_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1055_cast_fp16 = add(x = hidden_states_1053_cast_fp16, y = normed_output_359_cast_fp16)[name = string("hidden_states_1055_cast_fp16")];
+            fp16 var_4676_to_fp16 = const()[name = string("op_4676_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_4677_to_fp16 = const()[name = string("op_4677_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_286_cast_fp16 = clip(alpha = var_4676_to_fp16, beta = var_4677_to_fp16, x = hidden_states_1055_cast_fp16)[name = string("clip_286_cast_fp16")];
+            string clip_286_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_286_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4679 = const()[name = string("op_4679"), val = fp32(-0x1p-1)];
+            fp32 var_4683_promoted = const()[name = string("op_4683_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_286_cast_fp16_to_fp32 = cast(dtype = clip_286_cast_fp16_to_fp32_dtype_0, x = clip_286_cast_fp16)[name = string("cast_79")];
+            tensor<fp32, [1, 50, 1024]> var_4689 = pow(x = clip_286_cast_fp16_to_fp32, y = var_4683_promoted)[name = string("op_4689")];
+            tensor<int32, [1]> var_4691_axes_0 = const()[name = string("op_4691_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4691_keep_dims_0 = const()[name = string("op_4691_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4691 = reduce_mean(axes = var_4691_axes_0, keep_dims = var_4691_keep_dims_0, x = var_4689)[name = string("op_4691")];
+            string var_4691_to_fp16_dtype_0 = const()[name = string("op_4691_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4692_to_fp16 = const()[name = string("op_4692_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4691_to_fp16 = cast(dtype = var_4691_to_fp16_dtype_0, x = var_4691)[name = string("cast_78")];
+            tensor<fp16, [1, 50, 1]> mean_squared_185_cast_fp16 = add(x = var_4691_to_fp16, y = var_4692_to_fp16)[name = string("mean_squared_185_cast_fp16")];
+            string mean_squared_185_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_185_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_185_cast_fp16_to_fp32 = cast(dtype = mean_squared_185_cast_fp16_to_fp32_dtype_0, x = mean_squared_185_cast_fp16)[name = string("cast_77")];
+            tensor<fp32, [1, 50, 1]> var_4694 = pow(x = mean_squared_185_cast_fp16_to_fp32, y = var_4679)[name = string("op_4694")];
+            string var_4694_to_fp16_dtype_0 = const()[name = string("op_4694_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4694_to_fp16 = cast(dtype = var_4694_to_fp16_dtype_0, x = var_4694)[name = string("cast_76")];
+            tensor<fp16, [1, 50, 1024]> normed_output_369_cast_fp16 = mul(x = clip_286_cast_fp16, y = var_4694_to_fp16)[name = string("normed_output_369_cast_fp16")];
+            tensor<fp16, [1024]> const_154_to_fp16 = const()[name = string("const_154_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126037056)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_371_cast_fp16 = mul(x = normed_output_369_cast_fp16, y = const_154_to_fp16)[name = string("normed_output_371_cast_fp16")];
+            int32 var_4700 = const()[name = string("op_4700"), val = int32(-1)];
+            fp32 var_4701 = const()[name = string("op_4701"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_10_q_proj_input_min_to_fp16 = const()[name = string("self_attns_10_q_proj_input_min_to_fp16"), val = fp16(-0x1.4ep+3)];
+            fp16 self_attns_10_q_proj_input_max_to_fp16 = const()[name = string("self_attns_10_q_proj_input_max_to_fp16"), val = fp16(0x1.4ap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_287_cast_fp16 = clip(alpha = self_attns_10_q_proj_input_min_to_fp16, beta = self_attns_10_q_proj_input_max_to_fp16, x = normed_output_371_cast_fp16)[name = string("clip_287_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_10_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126039168))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126563520))))[name = string("self_attns_10_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_113_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_10_q_proj_linear_weight_to_fp16_palettized, x = clip_287_cast_fp16)[name = string("linear_113_cast_fp16")];
+            fp16 self_attns_10_q_proj_output_min_to_fp16 = const()[name = string("self_attns_10_q_proj_output_min_to_fp16"), val = fp16(-0x1.36p+4)];
+            fp16 self_attns_10_q_proj_output_max_to_fp16 = const()[name = string("self_attns_10_q_proj_output_max_to_fp16"), val = fp16(0x1.32p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_288_cast_fp16 = clip(alpha = self_attns_10_q_proj_output_min_to_fp16, beta = self_attns_10_q_proj_output_max_to_fp16, x = linear_113_cast_fp16)[name = string("clip_288_cast_fp16")];
+            tensor<int32, [4]> var_4745 = const()[name = string("op_4745"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_21_cast_fp16 = reshape(shape = var_4745, x = clip_288_cast_fp16)[name = string("q_21_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_10_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126564608))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127088960))))[name = string("self_attns_10_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_114_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_10_k_proj_linear_weight_to_fp16_palettized, x = clip_287_cast_fp16)[name = string("linear_114_cast_fp16")];
+            fp16 self_attns_10_k_proj_output_min_to_fp16 = const()[name = string("self_attns_10_k_proj_output_min_to_fp16"), val = fp16(-0x1.36p+4)];
+            fp16 self_attns_10_k_proj_output_max_to_fp16 = const()[name = string("self_attns_10_k_proj_output_max_to_fp16"), val = fp16(0x1.32p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_290_cast_fp16 = clip(alpha = self_attns_10_k_proj_output_min_to_fp16, beta = self_attns_10_k_proj_output_max_to_fp16, x = linear_114_cast_fp16)[name = string("clip_290_cast_fp16")];
+            tensor<int32, [4]> var_4757 = const()[name = string("op_4757"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_21_cast_fp16 = reshape(shape = var_4757, x = clip_290_cast_fp16)[name = string("k_21_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_10_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127090048))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127614400))))[name = string("self_attns_10_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_115_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_10_v_proj_linear_weight_to_fp16_palettized, x = clip_287_cast_fp16)[name = string("linear_115_cast_fp16")];
+            fp16 self_attns_10_v_proj_output_min_to_fp16 = const()[name = string("self_attns_10_v_proj_output_min_to_fp16"), val = fp16(-0x1.36p+4)];
+            fp16 self_attns_10_v_proj_output_max_to_fp16 = const()[name = string("self_attns_10_v_proj_output_max_to_fp16"), val = fp16(0x1.32p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_292_cast_fp16 = clip(alpha = self_attns_10_v_proj_output_min_to_fp16, beta = self_attns_10_v_proj_output_max_to_fp16, x = linear_115_cast_fp16)[name = string("clip_292_cast_fp16")];
+            tensor<int32, [4]> var_4769 = const()[name = string("op_4769"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_453_cast_fp16 = reshape(shape = var_4769, x = clip_292_cast_fp16)[name = string("input_453_cast_fp16")];
+            fp16 var_4771_to_fp16 = const()[name = string("op_4771_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_4772_cast_fp16 = mul(x = q_21_cast_fp16, y = var_4771_to_fp16)[name = string("op_4772_cast_fp16")];
+            tensor<fp16, [128]> var_4773_to_fp16 = const()[name = string("op_4773_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127615488)))];
+            tensor<fp16, [1, 50, 8, 128]> input_449_cast_fp16 = mul(x = var_4772_cast_fp16, y = var_4773_to_fp16)[name = string("input_449_cast_fp16")];
+            fp16 var_4775_to_fp16 = const()[name = string("op_4775_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_451_cast_fp16 = mul(x = k_21_cast_fp16, y = var_4775_to_fp16)[name = string("input_451_cast_fp16")];
+            tensor<int32, [8]> q_padded_21_pad_0 = const()[name = string("q_padded_21_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_21_mode_0 = const()[name = string("q_padded_21_mode_0"), val = string("constant")];
+            fp16 const_155_to_fp16 = const()[name = string("const_155_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_21_cast_fp16 = pad(constant_val = const_155_to_fp16, mode = q_padded_21_mode_0, pad = q_padded_21_pad_0, x = input_449_cast_fp16)[name = string("q_padded_21_cast_fp16")];
+            tensor<int32, [5]> var_4779 = const()[name = string("op_4779"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_21_cast_fp16 = reshape(shape = var_4779, x = q_padded_21_cast_fp16)[name = string("q_blocks_21_cast_fp16")];
+            tensor<int32, [8]> k_padded_21_pad_0 = const()[name = string("k_padded_21_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_21_mode_0 = const()[name = string("k_padded_21_mode_0"), val = string("constant")];
+            fp16 const_156_to_fp16 = const()[name = string("const_156_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_21_cast_fp16 = pad(constant_val = const_156_to_fp16, mode = k_padded_21_mode_0, pad = k_padded_21_pad_0, x = input_451_cast_fp16)[name = string("k_padded_21_cast_fp16")];
+            tensor<int32, [8]> v_padded_21_pad_0 = const()[name = string("v_padded_21_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_21_mode_0 = const()[name = string("v_padded_21_mode_0"), val = string("constant")];
+            fp16 const_157_to_fp16 = const()[name = string("const_157_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_21_cast_fp16 = pad(constant_val = const_157_to_fp16, mode = v_padded_21_mode_0, pad = v_padded_21_pad_0, x = input_453_cast_fp16)[name = string("v_padded_21_cast_fp16")];
+            tensor<int32, [4]> var_4786_begin_0 = const()[name = string("op_4786_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4786_end_0 = const()[name = string("op_4786_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_4786_end_mask_0 = const()[name = string("op_4786_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4786_cast_fp16 = slice_by_index(begin = var_4786_begin_0, end = var_4786_end_0, end_mask = var_4786_end_mask_0, x = k_padded_21_cast_fp16)[name = string("op_4786_cast_fp16")];
+            tensor<int32, [4]> var_4788_begin_0 = const()[name = string("op_4788_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_4788_end_0 = const()[name = string("op_4788_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_4788_end_mask_0 = const()[name = string("op_4788_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4788_cast_fp16 = slice_by_index(begin = var_4788_begin_0, end = var_4788_end_0, end_mask = var_4788_end_mask_0, x = k_padded_21_cast_fp16)[name = string("op_4788_cast_fp16")];
+            tensor<int32, [4]> var_4790_begin_0 = const()[name = string("op_4790_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_4790_end_0 = const()[name = string("op_4790_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_4790_end_mask_0 = const()[name = string("op_4790_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4790_cast_fp16 = slice_by_index(begin = var_4790_begin_0, end = var_4790_end_0, end_mask = var_4790_end_mask_0, x = k_padded_21_cast_fp16)[name = string("op_4790_cast_fp16")];
+            tensor<int32, [4]> var_4792_begin_0 = const()[name = string("op_4792_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_4792_end_0 = const()[name = string("op_4792_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_4792_end_mask_0 = const()[name = string("op_4792_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4792_cast_fp16 = slice_by_index(begin = var_4792_begin_0, end = var_4792_end_0, end_mask = var_4792_end_mask_0, x = k_padded_21_cast_fp16)[name = string("op_4792_cast_fp16")];
+            tensor<int32, [4]> var_4794_begin_0 = const()[name = string("op_4794_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_4794_end_0 = const()[name = string("op_4794_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_4794_end_mask_0 = const()[name = string("op_4794_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4794_cast_fp16 = slice_by_index(begin = var_4794_begin_0, end = var_4794_end_0, end_mask = var_4794_end_mask_0, x = k_padded_21_cast_fp16)[name = string("op_4794_cast_fp16")];
+            int32 k_blocks_21_axis_0 = const()[name = string("k_blocks_21_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_21_cast_fp16 = stack(axis = k_blocks_21_axis_0, values = (var_4786_cast_fp16, var_4788_cast_fp16, var_4790_cast_fp16, var_4792_cast_fp16, var_4794_cast_fp16))[name = string("k_blocks_21_cast_fp16")];
+            tensor<int32, [4]> var_4798_begin_0 = const()[name = string("op_4798_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4798_end_0 = const()[name = string("op_4798_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_4798_end_mask_0 = const()[name = string("op_4798_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4798_cast_fp16 = slice_by_index(begin = var_4798_begin_0, end = var_4798_end_0, end_mask = var_4798_end_mask_0, x = v_padded_21_cast_fp16)[name = string("op_4798_cast_fp16")];
+            tensor<int32, [4]> var_4800_begin_0 = const()[name = string("op_4800_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_4800_end_0 = const()[name = string("op_4800_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_4800_end_mask_0 = const()[name = string("op_4800_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4800_cast_fp16 = slice_by_index(begin = var_4800_begin_0, end = var_4800_end_0, end_mask = var_4800_end_mask_0, x = v_padded_21_cast_fp16)[name = string("op_4800_cast_fp16")];
+            tensor<int32, [4]> var_4802_begin_0 = const()[name = string("op_4802_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_4802_end_0 = const()[name = string("op_4802_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_4802_end_mask_0 = const()[name = string("op_4802_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4802_cast_fp16 = slice_by_index(begin = var_4802_begin_0, end = var_4802_end_0, end_mask = var_4802_end_mask_0, x = v_padded_21_cast_fp16)[name = string("op_4802_cast_fp16")];
+            tensor<int32, [4]> var_4804_begin_0 = const()[name = string("op_4804_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_4804_end_0 = const()[name = string("op_4804_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_4804_end_mask_0 = const()[name = string("op_4804_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4804_cast_fp16 = slice_by_index(begin = var_4804_begin_0, end = var_4804_end_0, end_mask = var_4804_end_mask_0, x = v_padded_21_cast_fp16)[name = string("op_4804_cast_fp16")];
+            tensor<int32, [4]> var_4806_begin_0 = const()[name = string("op_4806_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_4806_end_0 = const()[name = string("op_4806_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_4806_end_mask_0 = const()[name = string("op_4806_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_4806_cast_fp16 = slice_by_index(begin = var_4806_begin_0, end = var_4806_end_0, end_mask = var_4806_end_mask_0, x = v_padded_21_cast_fp16)[name = string("op_4806_cast_fp16")];
+            int32 v_blocks_21_axis_0 = const()[name = string("v_blocks_21_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_21_cast_fp16 = stack(axis = v_blocks_21_axis_0, values = (var_4798_cast_fp16, var_4800_cast_fp16, var_4802_cast_fp16, var_4804_cast_fp16, var_4806_cast_fp16))[name = string("v_blocks_21_cast_fp16")];
+            tensor<int32, [5]> var_4814 = const()[name = string("op_4814"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_4816 = const()[name = string("op_4816"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_21_transpose_x_0 = const()[name = string("matrix_ac_21_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_21_transpose_y_0 = const()[name = string("matrix_ac_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_21_cast_fp16 = transpose(perm = var_4814, x = q_blocks_21_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_21_cast_fp16 = transpose(perm = var_4816, x = k_blocks_21_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_21_cast_fp16 = matmul(transpose_x = matrix_ac_21_transpose_x_0, transpose_y = matrix_ac_21_transpose_y_0, x = queries_21_cast_fp16, y = keys_t_21_cast_fp16)[name = string("matrix_ac_21_cast_fp16")];
+            tensor<int32, [4]> var_4819 = const()[name = string("op_4819"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_21_cast_fp16 = reshape(shape = var_4819, x = queries_21_cast_fp16)[name = string("q_flat_21_cast_fp16")];
+            bool matrix_bd_101_transpose_x_0 = const()[name = string("matrix_bd_101_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_101_transpose_y_0 = const()[name = string("matrix_bd_101_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_21_to_fp16 = const()[name = string("rel_k_t_21_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127615808)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_101_cast_fp16 = matmul(transpose_x = matrix_bd_101_transpose_x_0, transpose_y = matrix_bd_101_transpose_y_0, x = q_flat_21_cast_fp16, y = rel_k_t_21_to_fp16)[name = string("matrix_bd_101_cast_fp16")];
+            tensor<int32, [5]> var_4824 = const()[name = string("op_4824"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_455_cast_fp16 = reshape(shape = var_4824, x = matrix_bd_101_cast_fp16)[name = string("input_455_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_103_pad_0 = const()[name = string("matrix_bd_103_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127642496)))];
+            string matrix_bd_103_mode_0 = const()[name = string("matrix_bd_103_mode_0"), val = string("constant")];
+            fp16 const_159_to_fp16 = const()[name = string("const_159_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_103_cast_fp16 = pad(constant_val = const_159_to_fp16, mode = matrix_bd_103_mode_0, pad = matrix_bd_103_pad_0, x = input_455_cast_fp16)[name = string("matrix_bd_103_cast_fp16")];
+            tensor<int32, [4]> var_4828 = const()[name = string("op_4828"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_105_cast_fp16 = reshape(shape = var_4828, x = matrix_bd_103_cast_fp16)[name = string("matrix_bd_105_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_107_begin_0 = const()[name = string("matrix_bd_107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_107_end_0 = const()[name = string("matrix_bd_107_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_107_end_mask_0 = const()[name = string("matrix_bd_107_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_107_cast_fp16 = slice_by_index(begin = matrix_bd_107_begin_0, end = matrix_bd_107_end_0, end_mask = matrix_bd_107_end_mask_0, x = matrix_bd_105_cast_fp16)[name = string("matrix_bd_107_cast_fp16")];
+            tensor<int32, [5]> var_4834 = const()[name = string("op_4834"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_109_cast_fp16 = reshape(shape = var_4834, x = matrix_bd_107_cast_fp16)[name = string("matrix_bd_109_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_61_cast_fp16 = add(x = matrix_ac_21_cast_fp16, y = matrix_bd_109_cast_fp16)[name = string("attn_61_cast_fp16")];
+            fp16 _inversed_4837_y_0_to_fp16 = const()[name = string("_inversed_4837_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_4837_cast_fp16 = mul(x = attn_61_cast_fp16, y = _inversed_4837_y_0_to_fp16)[name = string("_inversed_4837_cast_fp16")];
+            string _inversed_4837_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_4837_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_4837_cast_fp16_to_fp32 = cast(dtype = _inversed_4837_cast_fp16_to_fp32_dtype_0, x = _inversed_4837_cast_fp16)[name = string("cast_75")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_4838 = tanh(x = _inversed_4837_cast_fp16_to_fp32)[name = string("op_4838")];
+            string var_4838_to_fp16_dtype_0 = const()[name = string("op_4838_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_10_softcap_to_fp16 = const()[name = string("self_attns_10_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_4838_to_fp16 = cast(dtype = var_4838_to_fp16_dtype_0, x = var_4838)[name = string("cast_74")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_63_cast_fp16 = mul(x = var_4838_to_fp16, y = self_attns_10_softcap_to_fp16)[name = string("attn_63_cast_fp16")];
+            string attn_63_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_63_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_63_cast_fp16_to_fp32 = cast(dtype = attn_63_cast_fp16_to_fp32_dtype_0, x = attn_63_cast_fp16)[name = string("cast_73")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_457 = select(a = var_4701, b = attn_63_cast_fp16_to_fp32, cond = var_460)[name = string("input_457")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_4842 = softmax(axis = var_4700, x = input_457)[name = string("op_4842")];
+            tensor<int32, [5]> var_4844 = const()[name = string("op_4844"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_61_transpose_x_0 = const()[name = string("out_61_transpose_x_0"), val = bool(false)];
+            bool out_61_transpose_y_0 = const()[name = string("out_61_transpose_y_0"), val = bool(false)];
+            string var_4842_to_fp16_dtype_0 = const()[name = string("op_4842_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_21_cast_fp16 = transpose(perm = var_4844, x = v_blocks_21_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_4842_to_fp16 = cast(dtype = var_4842_to_fp16_dtype_0, x = var_4842)[name = string("cast_72")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_61_cast_fp16 = matmul(transpose_x = out_61_transpose_x_0, transpose_y = out_61_transpose_y_0, x = var_4842_to_fp16, y = values_t_21_cast_fp16)[name = string("out_61_cast_fp16")];
+            tensor<int32, [5]> var_4847 = const()[name = string("op_4847"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_4849 = const()[name = string("op_4849"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_4848_cast_fp16 = transpose(perm = var_4847, x = out_61_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 60, 1024]> out_63_cast_fp16 = reshape(shape = var_4849, x = var_4848_cast_fp16)[name = string("out_63_cast_fp16")];
+            tensor<int32, [3]> var_4852_begin_0 = const()[name = string("op_4852_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_4852_end_0 = const()[name = string("op_4852_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_4852_end_mask_0 = const()[name = string("op_4852_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_4852_cast_fp16 = slice_by_index(begin = var_4852_begin_0, end = var_4852_end_0, end_mask = var_4852_end_mask_0, x = out_63_cast_fp16)[name = string("op_4852_cast_fp16")];
+            fp16 self_attns_10_post_input_min_to_fp16 = const()[name = string("self_attns_10_post_input_min_to_fp16"), val = fp16(-0x1.3p+4)];
+            fp16 self_attns_10_post_input_max_to_fp16 = const()[name = string("self_attns_10_post_input_max_to_fp16"), val = fp16(0x1.2cp+4)];
+            tensor<fp16, [1, 50, 1024]> clip_293_cast_fp16 = clip(alpha = self_attns_10_post_input_min_to_fp16, beta = self_attns_10_post_input_max_to_fp16, x = var_4852_cast_fp16)[name = string("clip_293_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_10_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(127642624))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(128166976))))[name = string("self_attns_10_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_117_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_10_post_linear_weight_to_fp16_palettized, x = clip_293_cast_fp16)[name = string("linear_117_cast_fp16")];
+            fp16 self_attns_10_post_output_min_to_fp16 = const()[name = string("self_attns_10_post_output_min_to_fp16"), val = fp16(-0x1.9p+6)];
+            fp16 self_attns_10_post_output_max_to_fp16 = const()[name = string("self_attns_10_post_output_max_to_fp16"), val = fp16(0x1.8cp+6)];
+            tensor<fp16, [1, 50, 1024]> clip_294_cast_fp16 = clip(alpha = self_attns_10_post_output_min_to_fp16, beta = self_attns_10_post_output_max_to_fp16, x = linear_117_cast_fp16)[name = string("clip_294_cast_fp16")];
+            fp16 var_4864_to_fp16 = const()[name = string("op_4864_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_4865_to_fp16 = const()[name = string("op_4865_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_295_cast_fp16 = clip(alpha = var_4864_to_fp16, beta = var_4865_to_fp16, x = clip_294_cast_fp16)[name = string("clip_295_cast_fp16")];
+            string clip_295_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_295_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4867 = const()[name = string("op_4867"), val = fp32(-0x1p-1)];
+            fp32 var_4871_promoted = const()[name = string("op_4871_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_295_cast_fp16_to_fp32 = cast(dtype = clip_295_cast_fp16_to_fp32_dtype_0, x = clip_295_cast_fp16)[name = string("cast_71")];
+            tensor<fp32, [1, 50, 1024]> var_4877 = pow(x = clip_295_cast_fp16_to_fp32, y = var_4871_promoted)[name = string("op_4877")];
+            tensor<int32, [1]> var_4879_axes_0 = const()[name = string("op_4879_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4879_keep_dims_0 = const()[name = string("op_4879_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4879 = reduce_mean(axes = var_4879_axes_0, keep_dims = var_4879_keep_dims_0, x = var_4877)[name = string("op_4879")];
+            string var_4879_to_fp16_dtype_0 = const()[name = string("op_4879_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4880_to_fp16 = const()[name = string("op_4880_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4879_to_fp16 = cast(dtype = var_4879_to_fp16_dtype_0, x = var_4879)[name = string("cast_70")];
+            tensor<fp16, [1, 50, 1]> mean_squared_187_cast_fp16 = add(x = var_4879_to_fp16, y = var_4880_to_fp16)[name = string("mean_squared_187_cast_fp16")];
+            string mean_squared_187_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_187_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_187_cast_fp16_to_fp32 = cast(dtype = mean_squared_187_cast_fp16_to_fp32_dtype_0, x = mean_squared_187_cast_fp16)[name = string("cast_69")];
+            tensor<fp32, [1, 50, 1]> var_4882 = pow(x = mean_squared_187_cast_fp16_to_fp32, y = var_4867)[name = string("op_4882")];
+            string var_4882_to_fp16_dtype_0 = const()[name = string("op_4882_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4882_to_fp16 = cast(dtype = var_4882_to_fp16_dtype_0, x = var_4882)[name = string("cast_68")];
+            tensor<fp16, [1, 50, 1024]> normed_output_373_cast_fp16 = mul(x = clip_295_cast_fp16, y = var_4882_to_fp16)[name = string("normed_output_373_cast_fp16")];
+            tensor<fp16, [1024]> const_160_to_fp16 = const()[name = string("const_160_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(128168064)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_375_cast_fp16 = mul(x = normed_output_373_cast_fp16, y = const_160_to_fp16)[name = string("normed_output_375_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1081_cast_fp16 = add(x = normed_output_375_cast_fp16, y = hidden_states_1055_cast_fp16)[name = string("hidden_states_1081_cast_fp16")];
+            string hidden_states_1081_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1081_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4889 = const()[name = string("op_4889"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4890 = const()[name = string("op_4890"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_4902 = const()[name = string("op_4902"), val = fp32(-0x1p-1)];
+            fp32 var_4898_promoted = const()[name = string("op_4898_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1081_cast_fp16_to_fp32 = cast(dtype = hidden_states_1081_cast_fp16_to_fp32_dtype_0, x = hidden_states_1081_cast_fp16)[name = string("cast_67")];
+            tensor<fp32, [1, 50, 1024]> var_4910 = pow(x = hidden_states_1081_cast_fp16_to_fp32, y = var_4898_promoted)[name = string("op_4910")];
+            tensor<int32, [1]> var_4912_axes_0 = const()[name = string("op_4912_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4912_keep_dims_0 = const()[name = string("op_4912_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4912 = reduce_mean(axes = var_4912_axes_0, keep_dims = var_4912_keep_dims_0, x = var_4910)[name = string("op_4912")];
+            string var_4912_to_fp16_dtype_0 = const()[name = string("op_4912_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4913_to_fp16 = const()[name = string("op_4913_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4912_to_fp16 = cast(dtype = var_4912_to_fp16_dtype_0, x = var_4912)[name = string("cast_66")];
+            tensor<fp16, [1, 50, 1]> mean_squared_189_cast_fp16 = add(x = var_4912_to_fp16, y = var_4913_to_fp16)[name = string("mean_squared_189_cast_fp16")];
+            string mean_squared_189_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_189_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_189_cast_fp16_to_fp32 = cast(dtype = mean_squared_189_cast_fp16_to_fp32_dtype_0, x = mean_squared_189_cast_fp16)[name = string("cast_65")];
+            tensor<fp32, [1, 50, 1]> var_4915 = pow(x = mean_squared_189_cast_fp16_to_fp32, y = var_4902)[name = string("op_4915")];
+            string var_4915_to_fp16_dtype_0 = const()[name = string("op_4915_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_4915_to_fp16 = cast(dtype = var_4915_to_fp16_dtype_0, x = var_4915)[name = string("cast_64")];
+            tensor<fp16, [1, 50, 1024]> normed_output_377_cast_fp16 = mul(x = hidden_states_1081_cast_fp16, y = var_4915_to_fp16)[name = string("normed_output_377_cast_fp16")];
+            tensor<fp16, [1024]> const_161_to_fp16 = const()[name = string("const_161_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(128170176)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_379_cast_fp16 = mul(x = normed_output_377_cast_fp16, y = const_161_to_fp16)[name = string("normed_output_379_cast_fp16")];
+            fp16 lconv1ds_10_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_10_linear_start_input_min_to_fp16"), val = fp16(-0x1.4ap+3)];
+            fp16 lconv1ds_10_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_10_linear_start_input_max_to_fp16"), val = fp16(0x1.48p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_296_cast_fp16 = clip(alpha = lconv1ds_10_linear_start_input_min_to_fp16, beta = lconv1ds_10_linear_start_input_max_to_fp16, x = normed_output_379_cast_fp16)[name = string("clip_296_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_10_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(128172288))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129220928))))[name = string("lconv1ds_10_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_118_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_10_linear_start_linear_weight_to_fp16_palettized, x = clip_296_cast_fp16)[name = string("linear_118_cast_fp16")];
+            fp16 lconv1ds_10_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_10_linear_start_output_min_to_fp16"), val = fp16(-0x1.bap+4)];
+            fp16 lconv1ds_10_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_10_linear_start_output_max_to_fp16"), val = fp16(0x1.b6p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_297_cast_fp16 = clip(alpha = lconv1ds_10_linear_start_output_min_to_fp16, beta = lconv1ds_10_linear_start_output_max_to_fp16, x = linear_118_cast_fp16)[name = string("clip_297_cast_fp16")];
+            int32 hidden_states_1089_split_num_splits_0 = const()[name = string("hidden_states_1089_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_1089_split_axis_0 = const()[name = string("hidden_states_1089_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1089_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_1089_split_cast_fp16_1 = split(axis = hidden_states_1089_split_axis_0, num_splits = hidden_states_1089_split_num_splits_0, x = clip_297_cast_fp16)[name = string("hidden_states_1089_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1089_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_1089_split_cast_fp16_1)[name = string("hidden_states_1089_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1089_cast_fp16 = mul(x = hidden_states_1089_split_cast_fp16_0, y = hidden_states_1089_split_1_sigmoid_cast_fp16)[name = string("hidden_states_1089_cast_fp16")];
+            tensor<int32, [3]> input_465_perm_0 = const()[name = string("input_465_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_467_pad_0 = const()[name = string("input_467_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_467_mode_0 = const()[name = string("input_467_mode_0"), val = string("constant")];
+            fp16 const_162_to_fp16 = const()[name = string("const_162_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_465_cast_fp16 = transpose(perm = input_465_perm_0, x = hidden_states_1089_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1024, 54]> input_467_cast_fp16 = pad(constant_val = const_162_to_fp16, mode = input_467_mode_0, pad = input_467_pad_0, x = input_465_cast_fp16)[name = string("input_467_cast_fp16")];
+            string var_4941_pad_type_0 = const()[name = string("op_4941_pad_type_0"), val = string("valid")];
+            int32 var_4941_groups_0 = const()[name = string("op_4941_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_4941_strides_0 = const()[name = string("op_4941_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4941_pad_0 = const()[name = string("op_4941_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4941_dilations_0 = const()[name = string("op_4941_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_10_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129223040))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129225664))))[name = string("lconv1ds_10_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_4941_cast_fp16 = conv(dilations = var_4941_dilations_0, groups = var_4941_groups_0, pad = var_4941_pad_0, pad_type = var_4941_pad_type_0, strides = var_4941_strides_0, weight = lconv1ds_10_depthwise_conv1d_weight_to_fp16_palettized, x = input_467_cast_fp16)[name = string("op_4941_cast_fp16")];
+            tensor<int32, [3]> hidden_states_1091_perm_0 = const()[name = string("hidden_states_1091_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_1091_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1091_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1091_cast_fp16 = transpose(perm = hidden_states_1091_perm_0, x = var_4941_cast_fp16)[name = string("transpose_6")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1091_cast_fp16_to_fp32 = cast(dtype = hidden_states_1091_cast_fp16_to_fp32_dtype_0, x = hidden_states_1091_cast_fp16)[name = string("cast_63")];
+            tensor<fp32, [1, 50, 1024]> clip_298 = clip(alpha = var_4890, beta = var_4889, x = hidden_states_1091_cast_fp16_to_fp32)[name = string("clip_298")];
+            fp32 var_4898_promoted_1 = const()[name = string("op_4898_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4946 = pow(x = clip_298, y = var_4898_promoted_1)[name = string("op_4946")];
+            tensor<int32, [1]> var_4948_axes_0 = const()[name = string("op_4948_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4948_keep_dims_0 = const()[name = string("op_4948_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4948 = reduce_mean(axes = var_4948_axes_0, keep_dims = var_4948_keep_dims_0, x = var_4946)[name = string("op_4948")];
+            string var_4948_to_fp16_dtype_0 = const()[name = string("op_4948_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4949_to_fp16 = const()[name = string("op_4949_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4948_to_fp16 = cast(dtype = var_4948_to_fp16_dtype_0, x = var_4948)[name = string("cast_62")];
+            tensor<fp16, [1, 50, 1]> mean_squared_191_cast_fp16 = add(x = var_4948_to_fp16, y = var_4949_to_fp16)[name = string("mean_squared_191_cast_fp16")];
+            string mean_squared_191_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_191_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_191_cast_fp16_to_fp32 = cast(dtype = mean_squared_191_cast_fp16_to_fp32_dtype_0, x = mean_squared_191_cast_fp16)[name = string("cast_61")];
+            tensor<fp32, [1, 50, 1]> var_4951 = pow(x = mean_squared_191_cast_fp16_to_fp32, y = var_4902)[name = string("op_4951")];
+            string clip_298_to_fp16_dtype_0 = const()[name = string("clip_298_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4951_to_fp16_dtype_0 = const()[name = string("op_4951_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_298_to_fp16 = cast(dtype = clip_298_to_fp16_dtype_0, x = clip_298)[name = string("cast_59")];
+            tensor<fp16, [1, 50, 1]> var_4951_to_fp16 = cast(dtype = var_4951_to_fp16_dtype_0, x = var_4951)[name = string("cast_60")];
+            tensor<fp16, [1, 50, 1024]> normed_output_381_cast_fp16 = mul(x = clip_298_to_fp16, y = var_4951_to_fp16)[name = string("normed_output_381_cast_fp16")];
+            tensor<fp16, [1024]> const_163_to_fp16 = const()[name = string("const_163_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129226752)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_383_cast_fp16 = mul(x = normed_output_381_cast_fp16, y = const_163_to_fp16)[name = string("normed_output_383_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1097_cast_fp16 = silu(x = normed_output_383_cast_fp16)[name = string("hidden_states_1097_cast_fp16")];
+            fp16 lconv1ds_10_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_10_linear_end_input_min_to_fp16"), val = fp16(-0x1.f4p+3)];
+            fp16 lconv1ds_10_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_10_linear_end_input_max_to_fp16"), val = fp16(0x1.fp+3)];
+            tensor<fp16, [1, 50, 1024]> clip_299_cast_fp16 = clip(alpha = lconv1ds_10_linear_end_input_min_to_fp16, beta = lconv1ds_10_linear_end_input_max_to_fp16, x = hidden_states_1097_cast_fp16)[name = string("clip_299_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_10_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129228864))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129753216))))[name = string("lconv1ds_10_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_119_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_10_linear_end_linear_weight_to_fp16_palettized, x = clip_299_cast_fp16)[name = string("linear_119_cast_fp16")];
+            fp16 lconv1ds_10_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_10_linear_end_output_min_to_fp16"), val = fp16(-0x1.6p+3)];
+            fp16 lconv1ds_10_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_10_linear_end_output_max_to_fp16"), val = fp16(0x1.5ep+3)];
+            tensor<fp16, [1, 50, 1024]> clip_300_cast_fp16 = clip(alpha = lconv1ds_10_linear_end_output_min_to_fp16, beta = lconv1ds_10_linear_end_output_max_to_fp16, x = linear_119_cast_fp16)[name = string("clip_300_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1103_cast_fp16 = add(x = clip_300_cast_fp16, y = hidden_states_1081_cast_fp16)[name = string("hidden_states_1103_cast_fp16")];
+            string hidden_states_1103_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1103_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_4975 = const()[name = string("op_4975"), val = fp32(-0x1p-1)];
+            fp32 var_4976 = const()[name = string("op_4976"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_4977 = const()[name = string("op_4977"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1103_cast_fp16_to_fp32 = cast(dtype = hidden_states_1103_cast_fp16_to_fp32_dtype_0, x = hidden_states_1103_cast_fp16)[name = string("cast_58")];
+            tensor<fp32, [1, 50, 1024]> clip_301 = clip(alpha = var_4977, beta = var_4976, x = hidden_states_1103_cast_fp16_to_fp32)[name = string("clip_301")];
+            fp32 var_4971_promoted = const()[name = string("op_4971_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_4985 = pow(x = clip_301, y = var_4971_promoted)[name = string("op_4985")];
+            tensor<int32, [1]> var_4987_axes_0 = const()[name = string("op_4987_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4987_keep_dims_0 = const()[name = string("op_4987_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_4987 = reduce_mean(axes = var_4987_axes_0, keep_dims = var_4987_keep_dims_0, x = var_4985)[name = string("op_4987")];
+            string var_4987_to_fp16_dtype_0 = const()[name = string("op_4987_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_4988_to_fp16 = const()[name = string("op_4988_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_4987_to_fp16 = cast(dtype = var_4987_to_fp16_dtype_0, x = var_4987)[name = string("cast_57")];
+            tensor<fp16, [1, 50, 1]> mean_squared_193_cast_fp16 = add(x = var_4987_to_fp16, y = var_4988_to_fp16)[name = string("mean_squared_193_cast_fp16")];
+            string mean_squared_193_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_193_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_193_cast_fp16_to_fp32 = cast(dtype = mean_squared_193_cast_fp16_to_fp32_dtype_0, x = mean_squared_193_cast_fp16)[name = string("cast_56")];
+            tensor<fp32, [1, 50, 1]> var_4990 = pow(x = mean_squared_193_cast_fp16_to_fp32, y = var_4975)[name = string("op_4990")];
+            string clip_301_to_fp16_dtype_0 = const()[name = string("clip_301_to_fp16_dtype_0"), val = string("fp16")];
+            string var_4990_to_fp16_dtype_0 = const()[name = string("op_4990_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_301_to_fp16 = cast(dtype = clip_301_to_fp16_dtype_0, x = clip_301)[name = string("cast_54")];
+            tensor<fp16, [1, 50, 1]> var_4990_to_fp16 = cast(dtype = var_4990_to_fp16_dtype_0, x = var_4990)[name = string("cast_55")];
+            tensor<fp16, [1, 50, 1024]> normed_output_385_cast_fp16 = mul(x = clip_301_to_fp16, y = var_4990_to_fp16)[name = string("normed_output_385_cast_fp16")];
+            tensor<fp16, [1024]> const_164_to_fp16 = const()[name = string("const_164_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129754304)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_387_cast_fp16 = mul(x = normed_output_385_cast_fp16, y = const_164_to_fp16)[name = string("normed_output_387_cast_fp16")];
+            fp16 feed_forward2s_10_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.8ap+4)];
+            fp16 feed_forward2s_10_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.88p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_302_cast_fp16 = clip(alpha = feed_forward2s_10_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_10_ffw_layer_1_input_max_to_fp16, x = normed_output_387_cast_fp16)[name = string("clip_302_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_10_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129756416))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131853632))))[name = string("feed_forward2s_10_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_120_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_10_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_302_cast_fp16)[name = string("linear_120_cast_fp16")];
+            fp16 feed_forward2s_10_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.12p+6)];
+            fp16 feed_forward2s_10_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.1p+6)];
+            tensor<fp16, [1, 50, 4096]> clip_303_cast_fp16 = clip(alpha = feed_forward2s_10_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_10_ffw_layer_1_output_max_to_fp16, x = linear_120_cast_fp16)[name = string("clip_303_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_1113_cast_fp16 = silu(x = clip_303_cast_fp16)[name = string("hidden_states_1113_cast_fp16")];
+            fp16 feed_forward2s_10_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.76p+4)];
+            fp16 feed_forward2s_10_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.72p+4)];
+            tensor<fp16, [1, 50, 4096]> clip_304_cast_fp16 = clip(alpha = feed_forward2s_10_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_10_ffw_layer_2_input_max_to_fp16, x = hidden_states_1113_cast_fp16)[name = string("clip_304_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_10_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131857792))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133955008))))[name = string("feed_forward2s_10_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_121_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_10_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_304_cast_fp16)[name = string("linear_121_cast_fp16")];
+            fp16 feed_forward2s_10_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.a8p+7)];
+            fp16 feed_forward2s_10_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_10_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.a6p+7)];
+            tensor<fp16, [1, 50, 1024]> clip_305_cast_fp16 = clip(alpha = feed_forward2s_10_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_10_ffw_layer_2_output_max_to_fp16, x = linear_121_cast_fp16)[name = string("clip_305_cast_fp16")];
+            string clip_305_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_305_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_305_cast_fp16_to_fp32 = cast(dtype = clip_305_cast_fp16_to_fp32_dtype_0, x = clip_305_cast_fp16)[name = string("cast_53")];
+            tensor<fp32, [1, 50, 1024]> clip_306 = clip(alpha = var_4977, beta = var_4976, x = clip_305_cast_fp16_to_fp32)[name = string("clip_306")];
+            fp32 var_4971_promoted_1 = const()[name = string("op_4971_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_5017 = pow(x = clip_306, y = var_4971_promoted_1)[name = string("op_5017")];
+            tensor<int32, [1]> var_5019_axes_0 = const()[name = string("op_5019_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5019_keep_dims_0 = const()[name = string("op_5019_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5019 = reduce_mean(axes = var_5019_axes_0, keep_dims = var_5019_keep_dims_0, x = var_5017)[name = string("op_5019")];
+            string var_5019_to_fp16_dtype_0 = const()[name = string("op_5019_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5020_to_fp16 = const()[name = string("op_5020_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5019_to_fp16 = cast(dtype = var_5019_to_fp16_dtype_0, x = var_5019)[name = string("cast_52")];
+            tensor<fp16, [1, 50, 1]> mean_squared_195_cast_fp16 = add(x = var_5019_to_fp16, y = var_5020_to_fp16)[name = string("mean_squared_195_cast_fp16")];
+            string mean_squared_195_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_195_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_195_cast_fp16_to_fp32 = cast(dtype = mean_squared_195_cast_fp16_to_fp32_dtype_0, x = mean_squared_195_cast_fp16)[name = string("cast_51")];
+            tensor<fp32, [1, 50, 1]> var_5022 = pow(x = mean_squared_195_cast_fp16_to_fp32, y = var_4975)[name = string("op_5022")];
+            string clip_306_to_fp16_dtype_0 = const()[name = string("clip_306_to_fp16_dtype_0"), val = string("fp16")];
+            string var_5022_to_fp16_dtype_0 = const()[name = string("op_5022_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_306_to_fp16 = cast(dtype = clip_306_to_fp16_dtype_0, x = clip_306)[name = string("cast_49")];
+            tensor<fp16, [1, 50, 1]> var_5022_to_fp16 = cast(dtype = var_5022_to_fp16_dtype_0, x = var_5022)[name = string("cast_50")];
+            tensor<fp16, [1, 50, 1024]> normed_output_389_cast_fp16 = mul(x = clip_306_to_fp16, y = var_5022_to_fp16)[name = string("normed_output_389_cast_fp16")];
+            tensor<fp16, [1024]> const_165_to_fp16 = const()[name = string("const_165_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133956096)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_391_cast_fp16 = mul(x = normed_output_389_cast_fp16, y = const_165_to_fp16)[name = string("normed_output_391_cast_fp16")];
+            fp16 var_4967_to_fp16 = const()[name = string("op_4967_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1125_cast_fp16 = mul(x = normed_output_391_cast_fp16, y = var_4967_to_fp16)[name = string("hidden_states_1125_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1127_cast_fp16 = add(x = hidden_states_1125_cast_fp16, y = hidden_states_1103_cast_fp16)[name = string("hidden_states_1127_cast_fp16")];
+            fp16 var_5029_to_fp16 = const()[name = string("op_5029_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_5030_to_fp16 = const()[name = string("op_5030_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_307_cast_fp16 = clip(alpha = var_5029_to_fp16, beta = var_5030_to_fp16, x = hidden_states_1127_cast_fp16)[name = string("clip_307_cast_fp16")];
+            string clip_307_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_307_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5032 = const()[name = string("op_5032"), val = fp32(-0x1p-1)];
+            fp32 var_5036_promoted = const()[name = string("op_5036_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_307_cast_fp16_to_fp32 = cast(dtype = clip_307_cast_fp16_to_fp32_dtype_0, x = clip_307_cast_fp16)[name = string("cast_48")];
+            tensor<fp32, [1, 50, 1024]> var_5042 = pow(x = clip_307_cast_fp16_to_fp32, y = var_5036_promoted)[name = string("op_5042")];
+            tensor<int32, [1]> var_5044_axes_0 = const()[name = string("op_5044_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5044_keep_dims_0 = const()[name = string("op_5044_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5044 = reduce_mean(axes = var_5044_axes_0, keep_dims = var_5044_keep_dims_0, x = var_5042)[name = string("op_5044")];
+            string var_5044_to_fp16_dtype_0 = const()[name = string("op_5044_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5045_to_fp16 = const()[name = string("op_5045_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5044_to_fp16 = cast(dtype = var_5044_to_fp16_dtype_0, x = var_5044)[name = string("cast_47")];
+            tensor<fp16, [1, 50, 1]> mean_squared_197_cast_fp16 = add(x = var_5044_to_fp16, y = var_5045_to_fp16)[name = string("mean_squared_197_cast_fp16")];
+            string mean_squared_197_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_197_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_197_cast_fp16_to_fp32 = cast(dtype = mean_squared_197_cast_fp16_to_fp32_dtype_0, x = mean_squared_197_cast_fp16)[name = string("cast_46")];
+            tensor<fp32, [1, 50, 1]> var_5047 = pow(x = mean_squared_197_cast_fp16_to_fp32, y = var_5032)[name = string("op_5047")];
+            string var_5047_to_fp16_dtype_0 = const()[name = string("op_5047_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_5047_to_fp16 = cast(dtype = var_5047_to_fp16_dtype_0, x = var_5047)[name = string("cast_45")];
+            tensor<fp16, [1, 50, 1024]> normed_output_393_cast_fp16 = mul(x = clip_307_cast_fp16, y = var_5047_to_fp16)[name = string("normed_output_393_cast_fp16")];
+            tensor<fp16, [1024]> const_166_to_fp16 = const()[name = string("const_166_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133958208)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_395_cast_fp16 = mul(x = normed_output_393_cast_fp16, y = const_166_to_fp16)[name = string("normed_output_395_cast_fp16")];
+            string normed_output_395_cast_fp16_to_fp32_dtype_0 = const()[name = string("normed_output_395_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5060 = const()[name = string("op_5060"), val = fp32(-0x1p-1)];
+            fp32 var_5061 = const()[name = string("op_5061"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_5062 = const()[name = string("op_5062"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> normed_output_395_cast_fp16_to_fp32 = cast(dtype = normed_output_395_cast_fp16_to_fp32_dtype_0, x = normed_output_395_cast_fp16)[name = string("cast_44")];
+            tensor<fp32, [1, 50, 1024]> clip_308 = clip(alpha = var_5062, beta = var_5061, x = normed_output_395_cast_fp16_to_fp32)[name = string("clip_308")];
+            fp32 var_5056_promoted = const()[name = string("op_5056_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_5070 = pow(x = clip_308, y = var_5056_promoted)[name = string("op_5070")];
+            tensor<int32, [1]> var_5072_axes_0 = const()[name = string("op_5072_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5072_keep_dims_0 = const()[name = string("op_5072_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5072 = reduce_mean(axes = var_5072_axes_0, keep_dims = var_5072_keep_dims_0, x = var_5070)[name = string("op_5072")];
+            string var_5072_to_fp16_dtype_0 = const()[name = string("op_5072_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5073_to_fp16 = const()[name = string("op_5073_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5072_to_fp16 = cast(dtype = var_5072_to_fp16_dtype_0, x = var_5072)[name = string("cast_43")];
+            tensor<fp16, [1, 50, 1]> mean_squared_199_cast_fp16 = add(x = var_5072_to_fp16, y = var_5073_to_fp16)[name = string("mean_squared_199_cast_fp16")];
+            string mean_squared_199_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_199_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_199_cast_fp16_to_fp32 = cast(dtype = mean_squared_199_cast_fp16_to_fp32_dtype_0, x = mean_squared_199_cast_fp16)[name = string("cast_42")];
+            tensor<fp32, [1, 50, 1]> var_5075 = pow(x = mean_squared_199_cast_fp16_to_fp32, y = var_5060)[name = string("op_5075")];
+            string clip_308_to_fp16_dtype_0 = const()[name = string("clip_308_to_fp16_dtype_0"), val = string("fp16")];
+            string var_5075_to_fp16_dtype_0 = const()[name = string("op_5075_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_308_to_fp16 = cast(dtype = clip_308_to_fp16_dtype_0, x = clip_308)[name = string("cast_40")];
+            tensor<fp16, [1, 50, 1]> var_5075_to_fp16 = cast(dtype = var_5075_to_fp16_dtype_0, x = var_5075)[name = string("cast_41")];
+            tensor<fp16, [1, 50, 1024]> normed_output_397_cast_fp16 = mul(x = clip_308_to_fp16, y = var_5075_to_fp16)[name = string("normed_output_397_cast_fp16")];
+            tensor<fp16, [1024]> const_167_to_fp16 = const()[name = string("const_167_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133960320)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_399_cast_fp16 = mul(x = normed_output_397_cast_fp16, y = const_167_to_fp16)[name = string("normed_output_399_cast_fp16")];
+            fp16 feed_forward1s_11_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.28p+3)];
+            fp16 feed_forward1s_11_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.26p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_309_cast_fp16 = clip(alpha = feed_forward1s_11_ffw_layer_1_input_min_to_fp16, beta = feed_forward1s_11_ffw_layer_1_input_max_to_fp16, x = normed_output_399_cast_fp16)[name = string("clip_309_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward1s_11_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133962432))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(136059648))))[name = string("feed_forward1s_11_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_122_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward1s_11_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_309_cast_fp16)[name = string("linear_122_cast_fp16")];
+            fp16 feed_forward1s_11_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.5ep+4)];
+            fp16 feed_forward1s_11_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.5ap+4)];
+            tensor<fp16, [1, 50, 4096]> clip_310_cast_fp16 = clip(alpha = feed_forward1s_11_ffw_layer_1_output_min_to_fp16, beta = feed_forward1s_11_ffw_layer_1_output_max_to_fp16, x = linear_122_cast_fp16)[name = string("clip_310_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_1143_cast_fp16 = silu(x = clip_310_cast_fp16)[name = string("hidden_states_1143_cast_fp16")];
+            fp16 feed_forward1s_11_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.3cp+3)];
+            fp16 feed_forward1s_11_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.3ap+3)];
+            tensor<fp16, [1, 50, 4096]> clip_311_cast_fp16 = clip(alpha = feed_forward1s_11_ffw_layer_2_input_min_to_fp16, beta = feed_forward1s_11_ffw_layer_2_input_max_to_fp16, x = hidden_states_1143_cast_fp16)[name = string("clip_311_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward1s_11_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(136063808))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138161024))))[name = string("feed_forward1s_11_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_123_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward1s_11_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_311_cast_fp16)[name = string("linear_123_cast_fp16")];
+            fp16 feed_forward1s_11_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.9p+5)];
+            fp16 feed_forward1s_11_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward1s_11_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.8cp+5)];
+            tensor<fp16, [1, 50, 1024]> clip_312_cast_fp16 = clip(alpha = feed_forward1s_11_ffw_layer_2_output_min_to_fp16, beta = feed_forward1s_11_ffw_layer_2_output_max_to_fp16, x = linear_123_cast_fp16)[name = string("clip_312_cast_fp16")];
+            string clip_312_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_312_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_312_cast_fp16_to_fp32 = cast(dtype = clip_312_cast_fp16_to_fp32_dtype_0, x = clip_312_cast_fp16)[name = string("cast_39")];
+            tensor<fp32, [1, 50, 1024]> clip_313 = clip(alpha = var_5062, beta = var_5061, x = clip_312_cast_fp16_to_fp32)[name = string("clip_313")];
+            fp32 var_5056_promoted_1 = const()[name = string("op_5056_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_5102 = pow(x = clip_313, y = var_5056_promoted_1)[name = string("op_5102")];
+            tensor<int32, [1]> var_5104_axes_0 = const()[name = string("op_5104_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5104_keep_dims_0 = const()[name = string("op_5104_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5104 = reduce_mean(axes = var_5104_axes_0, keep_dims = var_5104_keep_dims_0, x = var_5102)[name = string("op_5104")];
+            string var_5104_to_fp16_dtype_0 = const()[name = string("op_5104_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5105_to_fp16 = const()[name = string("op_5105_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5104_to_fp16 = cast(dtype = var_5104_to_fp16_dtype_0, x = var_5104)[name = string("cast_38")];
+            tensor<fp16, [1, 50, 1]> mean_squared_201_cast_fp16 = add(x = var_5104_to_fp16, y = var_5105_to_fp16)[name = string("mean_squared_201_cast_fp16")];
+            string mean_squared_201_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_201_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_201_cast_fp16_to_fp32 = cast(dtype = mean_squared_201_cast_fp16_to_fp32_dtype_0, x = mean_squared_201_cast_fp16)[name = string("cast_37")];
+            tensor<fp32, [1, 50, 1]> var_5107 = pow(x = mean_squared_201_cast_fp16_to_fp32, y = var_5060)[name = string("op_5107")];
+            string clip_313_to_fp16_dtype_0 = const()[name = string("clip_313_to_fp16_dtype_0"), val = string("fp16")];
+            string var_5107_to_fp16_dtype_0 = const()[name = string("op_5107_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_313_to_fp16 = cast(dtype = clip_313_to_fp16_dtype_0, x = clip_313)[name = string("cast_35")];
+            tensor<fp16, [1, 50, 1]> var_5107_to_fp16 = cast(dtype = var_5107_to_fp16_dtype_0, x = var_5107)[name = string("cast_36")];
+            tensor<fp16, [1, 50, 1024]> normed_output_401_cast_fp16 = mul(x = clip_313_to_fp16, y = var_5107_to_fp16)[name = string("normed_output_401_cast_fp16")];
+            tensor<fp16, [1024]> const_168_to_fp16 = const()[name = string("const_168_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138162112)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_403_cast_fp16 = mul(x = normed_output_401_cast_fp16, y = const_168_to_fp16)[name = string("normed_output_403_cast_fp16")];
+            fp16 var_5052_to_fp16 = const()[name = string("op_5052_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1155_cast_fp16 = mul(x = normed_output_403_cast_fp16, y = var_5052_to_fp16)[name = string("hidden_states_1155_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1157_cast_fp16 = add(x = hidden_states_1155_cast_fp16, y = normed_output_395_cast_fp16)[name = string("hidden_states_1157_cast_fp16")];
+            fp16 var_5114_to_fp16 = const()[name = string("op_5114_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_5115_to_fp16 = const()[name = string("op_5115_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_314_cast_fp16 = clip(alpha = var_5114_to_fp16, beta = var_5115_to_fp16, x = hidden_states_1157_cast_fp16)[name = string("clip_314_cast_fp16")];
+            string clip_314_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_314_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5117 = const()[name = string("op_5117"), val = fp32(-0x1p-1)];
+            fp32 var_5121_promoted = const()[name = string("op_5121_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_314_cast_fp16_to_fp32 = cast(dtype = clip_314_cast_fp16_to_fp32_dtype_0, x = clip_314_cast_fp16)[name = string("cast_34")];
+            tensor<fp32, [1, 50, 1024]> var_5127 = pow(x = clip_314_cast_fp16_to_fp32, y = var_5121_promoted)[name = string("op_5127")];
+            tensor<int32, [1]> var_5129_axes_0 = const()[name = string("op_5129_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5129_keep_dims_0 = const()[name = string("op_5129_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5129 = reduce_mean(axes = var_5129_axes_0, keep_dims = var_5129_keep_dims_0, x = var_5127)[name = string("op_5129")];
+            string var_5129_to_fp16_dtype_0 = const()[name = string("op_5129_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5130_to_fp16 = const()[name = string("op_5130_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5129_to_fp16 = cast(dtype = var_5129_to_fp16_dtype_0, x = var_5129)[name = string("cast_33")];
+            tensor<fp16, [1, 50, 1]> mean_squared_203_cast_fp16 = add(x = var_5129_to_fp16, y = var_5130_to_fp16)[name = string("mean_squared_203_cast_fp16")];
+            string mean_squared_203_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_203_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_203_cast_fp16_to_fp32 = cast(dtype = mean_squared_203_cast_fp16_to_fp32_dtype_0, x = mean_squared_203_cast_fp16)[name = string("cast_32")];
+            tensor<fp32, [1, 50, 1]> var_5132 = pow(x = mean_squared_203_cast_fp16_to_fp32, y = var_5117)[name = string("op_5132")];
+            string var_5132_to_fp16_dtype_0 = const()[name = string("op_5132_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_5132_to_fp16 = cast(dtype = var_5132_to_fp16_dtype_0, x = var_5132)[name = string("cast_31")];
+            tensor<fp16, [1, 50, 1024]> normed_output_405_cast_fp16 = mul(x = clip_314_cast_fp16, y = var_5132_to_fp16)[name = string("normed_output_405_cast_fp16")];
+            tensor<fp16, [1024]> const_169_to_fp16 = const()[name = string("const_169_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138164224)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_407_cast_fp16 = mul(x = normed_output_405_cast_fp16, y = const_169_to_fp16)[name = string("normed_output_407_cast_fp16")];
+            int32 var_5138 = const()[name = string("op_5138"), val = int32(-1)];
+            fp32 var_5139 = const()[name = string("op_5139"), val = fp32(-0x1.dcd65p+29)];
+            fp16 self_attns_11_q_proj_input_min_to_fp16 = const()[name = string("self_attns_11_q_proj_input_min_to_fp16"), val = fp16(-0x1.2cp+3)];
+            fp16 self_attns_11_q_proj_input_max_to_fp16 = const()[name = string("self_attns_11_q_proj_input_max_to_fp16"), val = fp16(0x1.28p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_315_cast_fp16 = clip(alpha = self_attns_11_q_proj_input_min_to_fp16, beta = self_attns_11_q_proj_input_max_to_fp16, x = normed_output_407_cast_fp16)[name = string("clip_315_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_11_q_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138166336))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138690688))))[name = string("self_attns_11_q_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_124_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_11_q_proj_linear_weight_to_fp16_palettized, x = clip_315_cast_fp16)[name = string("linear_124_cast_fp16")];
+            fp16 self_attns_11_q_proj_output_min_to_fp16 = const()[name = string("self_attns_11_q_proj_output_min_to_fp16"), val = fp16(-0x1.12p+4)];
+            fp16 self_attns_11_q_proj_output_max_to_fp16 = const()[name = string("self_attns_11_q_proj_output_max_to_fp16"), val = fp16(0x1.1p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_316_cast_fp16 = clip(alpha = self_attns_11_q_proj_output_min_to_fp16, beta = self_attns_11_q_proj_output_max_to_fp16, x = linear_124_cast_fp16)[name = string("clip_316_cast_fp16")];
+            tensor<int32, [4]> var_5183 = const()[name = string("op_5183"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> q_cast_fp16 = reshape(shape = var_5183, x = clip_316_cast_fp16)[name = string("q_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_11_k_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(138691776))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139216128))))[name = string("self_attns_11_k_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_125_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_11_k_proj_linear_weight_to_fp16_palettized, x = clip_315_cast_fp16)[name = string("linear_125_cast_fp16")];
+            fp16 self_attns_11_k_proj_output_min_to_fp16 = const()[name = string("self_attns_11_k_proj_output_min_to_fp16"), val = fp16(-0x1.12p+4)];
+            fp16 self_attns_11_k_proj_output_max_to_fp16 = const()[name = string("self_attns_11_k_proj_output_max_to_fp16"), val = fp16(0x1.1p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_318_cast_fp16 = clip(alpha = self_attns_11_k_proj_output_min_to_fp16, beta = self_attns_11_k_proj_output_max_to_fp16, x = linear_125_cast_fp16)[name = string("clip_318_cast_fp16")];
+            tensor<int32, [4]> var_5195 = const()[name = string("op_5195"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> k_cast_fp16 = reshape(shape = var_5195, x = clip_318_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_11_v_proj_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139217216))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139741568))))[name = string("self_attns_11_v_proj_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_126_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_11_v_proj_linear_weight_to_fp16_palettized, x = clip_315_cast_fp16)[name = string("linear_126_cast_fp16")];
+            fp16 self_attns_11_v_proj_output_min_to_fp16 = const()[name = string("self_attns_11_v_proj_output_min_to_fp16"), val = fp16(-0x1.12p+4)];
+            fp16 self_attns_11_v_proj_output_max_to_fp16 = const()[name = string("self_attns_11_v_proj_output_max_to_fp16"), val = fp16(0x1.1p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_320_cast_fp16 = clip(alpha = self_attns_11_v_proj_output_min_to_fp16, beta = self_attns_11_v_proj_output_max_to_fp16, x = linear_126_cast_fp16)[name = string("clip_320_cast_fp16")];
+            tensor<int32, [4]> var_5207 = const()[name = string("op_5207"), val = tensor<int32, [4]>([1, 50, 8, 128])];
+            tensor<fp16, [1, 50, 8, 128]> input_495_cast_fp16 = reshape(shape = var_5207, x = clip_320_cast_fp16)[name = string("input_495_cast_fp16")];
+            fp16 var_5209_to_fp16 = const()[name = string("op_5209_to_fp16"), val = fp16(0x1.054p-3)];
+            tensor<fp16, [1, 50, 8, 128]> var_5210_cast_fp16 = mul(x = q_cast_fp16, y = var_5209_to_fp16)[name = string("op_5210_cast_fp16")];
+            tensor<fp16, [128]> var_5211_to_fp16 = const()[name = string("op_5211_to_fp16"), val = tensor<fp16, [128]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139742656)))];
+            tensor<fp16, [1, 50, 8, 128]> input_491_cast_fp16 = mul(x = var_5210_cast_fp16, y = var_5211_to_fp16)[name = string("input_491_cast_fp16")];
+            fp16 var_5213_to_fp16 = const()[name = string("op_5213_to_fp16"), val = fp16(0x1.e5p+0)];
+            tensor<fp16, [1, 50, 8, 128]> input_493_cast_fp16 = mul(x = k_cast_fp16, y = var_5213_to_fp16)[name = string("input_493_cast_fp16")];
+            tensor<int32, [8]> q_padded_pad_0 = const()[name = string("q_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 10, 0, 0, 0, 0])];
+            string q_padded_mode_0 = const()[name = string("q_padded_mode_0"), val = string("constant")];
+            fp16 const_170_to_fp16 = const()[name = string("const_170_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 60, 8, 128]> q_padded_cast_fp16 = pad(constant_val = const_170_to_fp16, mode = q_padded_mode_0, pad = q_padded_pad_0, x = input_491_cast_fp16)[name = string("q_padded_cast_fp16")];
+            tensor<int32, [5]> var_5217 = const()[name = string("op_5217"), val = tensor<int32, [5]>([1, 5, 12, 8, 128])];
+            tensor<fp16, [1, 5, 12, 8, 128]> q_blocks_cast_fp16 = reshape(shape = var_5217, x = q_padded_cast_fp16)[name = string("q_blocks_cast_fp16")];
+            tensor<int32, [8]> k_padded_pad_0 = const()[name = string("k_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string k_padded_mode_0 = const()[name = string("k_padded_mode_0"), val = string("constant")];
+            fp16 const_171_to_fp16 = const()[name = string("const_171_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> k_padded_cast_fp16 = pad(constant_val = const_171_to_fp16, mode = k_padded_mode_0, pad = k_padded_pad_0, x = input_493_cast_fp16)[name = string("k_padded_cast_fp16")];
+            tensor<int32, [8]> v_padded_pad_0 = const()[name = string("v_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 12, 11, 0, 0, 0, 0])];
+            string v_padded_mode_0 = const()[name = string("v_padded_mode_0"), val = string("constant")];
+            fp16 const_172_to_fp16 = const()[name = string("const_172_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 73, 8, 128]> v_padded_cast_fp16 = pad(constant_val = const_172_to_fp16, mode = v_padded_mode_0, pad = v_padded_pad_0, x = input_495_cast_fp16)[name = string("v_padded_cast_fp16")];
+            tensor<int32, [4]> var_5224_begin_0 = const()[name = string("op_5224_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5224_end_0 = const()[name = string("op_5224_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_5224_end_mask_0 = const()[name = string("op_5224_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5224_cast_fp16 = slice_by_index(begin = var_5224_begin_0, end = var_5224_end_0, end_mask = var_5224_end_mask_0, x = k_padded_cast_fp16)[name = string("op_5224_cast_fp16")];
+            tensor<int32, [4]> var_5226_begin_0 = const()[name = string("op_5226_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_5226_end_0 = const()[name = string("op_5226_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_5226_end_mask_0 = const()[name = string("op_5226_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5226_cast_fp16 = slice_by_index(begin = var_5226_begin_0, end = var_5226_end_0, end_mask = var_5226_end_mask_0, x = k_padded_cast_fp16)[name = string("op_5226_cast_fp16")];
+            tensor<int32, [4]> var_5228_begin_0 = const()[name = string("op_5228_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_5228_end_0 = const()[name = string("op_5228_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_5228_end_mask_0 = const()[name = string("op_5228_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5228_cast_fp16 = slice_by_index(begin = var_5228_begin_0, end = var_5228_end_0, end_mask = var_5228_end_mask_0, x = k_padded_cast_fp16)[name = string("op_5228_cast_fp16")];
+            tensor<int32, [4]> var_5230_begin_0 = const()[name = string("op_5230_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_5230_end_0 = const()[name = string("op_5230_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_5230_end_mask_0 = const()[name = string("op_5230_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5230_cast_fp16 = slice_by_index(begin = var_5230_begin_0, end = var_5230_end_0, end_mask = var_5230_end_mask_0, x = k_padded_cast_fp16)[name = string("op_5230_cast_fp16")];
+            tensor<int32, [4]> var_5232_begin_0 = const()[name = string("op_5232_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_5232_end_0 = const()[name = string("op_5232_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_5232_end_mask_0 = const()[name = string("op_5232_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5232_cast_fp16 = slice_by_index(begin = var_5232_begin_0, end = var_5232_end_0, end_mask = var_5232_end_mask_0, x = k_padded_cast_fp16)[name = string("op_5232_cast_fp16")];
+            int32 k_blocks_axis_0 = const()[name = string("k_blocks_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> k_blocks_cast_fp16 = stack(axis = k_blocks_axis_0, values = (var_5224_cast_fp16, var_5226_cast_fp16, var_5228_cast_fp16, var_5230_cast_fp16, var_5232_cast_fp16))[name = string("k_blocks_cast_fp16")];
+            tensor<int32, [4]> var_5236_begin_0 = const()[name = string("op_5236_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5236_end_0 = const()[name = string("op_5236_end_0"), val = tensor<int32, [4]>([1, 24, 8, 128])];
+            tensor<bool, [4]> var_5236_end_mask_0 = const()[name = string("op_5236_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5236_cast_fp16 = slice_by_index(begin = var_5236_begin_0, end = var_5236_end_0, end_mask = var_5236_end_mask_0, x = v_padded_cast_fp16)[name = string("op_5236_cast_fp16")];
+            tensor<int32, [4]> var_5238_begin_0 = const()[name = string("op_5238_begin_0"), val = tensor<int32, [4]>([0, 12, 0, 0])];
+            tensor<int32, [4]> var_5238_end_0 = const()[name = string("op_5238_end_0"), val = tensor<int32, [4]>([1, 36, 8, 128])];
+            tensor<bool, [4]> var_5238_end_mask_0 = const()[name = string("op_5238_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5238_cast_fp16 = slice_by_index(begin = var_5238_begin_0, end = var_5238_end_0, end_mask = var_5238_end_mask_0, x = v_padded_cast_fp16)[name = string("op_5238_cast_fp16")];
+            tensor<int32, [4]> var_5240_begin_0 = const()[name = string("op_5240_begin_0"), val = tensor<int32, [4]>([0, 24, 0, 0])];
+            tensor<int32, [4]> var_5240_end_0 = const()[name = string("op_5240_end_0"), val = tensor<int32, [4]>([1, 48, 8, 128])];
+            tensor<bool, [4]> var_5240_end_mask_0 = const()[name = string("op_5240_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5240_cast_fp16 = slice_by_index(begin = var_5240_begin_0, end = var_5240_end_0, end_mask = var_5240_end_mask_0, x = v_padded_cast_fp16)[name = string("op_5240_cast_fp16")];
+            tensor<int32, [4]> var_5242_begin_0 = const()[name = string("op_5242_begin_0"), val = tensor<int32, [4]>([0, 36, 0, 0])];
+            tensor<int32, [4]> var_5242_end_0 = const()[name = string("op_5242_end_0"), val = tensor<int32, [4]>([1, 60, 8, 128])];
+            tensor<bool, [4]> var_5242_end_mask_0 = const()[name = string("op_5242_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5242_cast_fp16 = slice_by_index(begin = var_5242_begin_0, end = var_5242_end_0, end_mask = var_5242_end_mask_0, x = v_padded_cast_fp16)[name = string("op_5242_cast_fp16")];
+            tensor<int32, [4]> var_5244_begin_0 = const()[name = string("op_5244_begin_0"), val = tensor<int32, [4]>([0, 48, 0, 0])];
+            tensor<int32, [4]> var_5244_end_0 = const()[name = string("op_5244_end_0"), val = tensor<int32, [4]>([1, 72, 8, 128])];
+            tensor<bool, [4]> var_5244_end_mask_0 = const()[name = string("op_5244_end_mask_0"), val = tensor<bool, [4]>([true, false, true, true])];
+            tensor<fp16, [1, 24, 8, 128]> var_5244_cast_fp16 = slice_by_index(begin = var_5244_begin_0, end = var_5244_end_0, end_mask = var_5244_end_mask_0, x = v_padded_cast_fp16)[name = string("op_5244_cast_fp16")];
+            int32 v_blocks_axis_0 = const()[name = string("v_blocks_axis_0"), val = int32(1)];
+            tensor<fp16, [1, 5, 24, 8, 128]> v_blocks_cast_fp16 = stack(axis = v_blocks_axis_0, values = (var_5236_cast_fp16, var_5238_cast_fp16, var_5240_cast_fp16, var_5242_cast_fp16, var_5244_cast_fp16))[name = string("v_blocks_cast_fp16")];
+            tensor<int32, [5]> var_5252 = const()[name = string("op_5252"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            tensor<int32, [5]> var_5254 = const()[name = string("op_5254"), val = tensor<int32, [5]>([0, 3, 1, -1, -3])];
+            bool matrix_ac_transpose_x_0 = const()[name = string("matrix_ac_transpose_x_0"), val = bool(false)];
+            bool matrix_ac_transpose_y_0 = const()[name = string("matrix_ac_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 5, 12, 128]> queries_cast_fp16 = transpose(perm = var_5252, x = q_blocks_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 8, 5, 128, 24]> keys_t_cast_fp16 = transpose(perm = var_5254, x = k_blocks_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_ac_cast_fp16 = matmul(transpose_x = matrix_ac_transpose_x_0, transpose_y = matrix_ac_transpose_y_0, x = queries_cast_fp16, y = keys_t_cast_fp16)[name = string("matrix_ac_cast_fp16")];
+            tensor<int32, [4]> var_5257 = const()[name = string("op_5257"), val = tensor<int32, [4]>([1, 8, 60, 128])];
+            tensor<fp16, [1, 8, 60, 128]> q_flat_cast_fp16 = reshape(shape = var_5257, x = queries_cast_fp16)[name = string("q_flat_cast_fp16")];
+            bool matrix_bd_111_transpose_x_0 = const()[name = string("matrix_bd_111_transpose_x_0"), val = bool(false)];
+            bool matrix_bd_111_transpose_y_0 = const()[name = string("matrix_bd_111_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [8, 128, 13]> rel_k_t_to_fp16 = const()[name = string("rel_k_t_to_fp16"), val = tensor<fp16, [8, 128, 13]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139742976)))];
+            tensor<fp16, [1, 8, 60, 13]> matrix_bd_111_cast_fp16 = matmul(transpose_x = matrix_bd_111_transpose_x_0, transpose_y = matrix_bd_111_transpose_y_0, x = q_flat_cast_fp16, y = rel_k_t_to_fp16)[name = string("matrix_bd_111_cast_fp16")];
+            tensor<int32, [5]> var_5262 = const()[name = string("op_5262"), val = tensor<int32, [5]>([1, 8, 5, 12, 13])];
+            tensor<fp16, [1, 8, 5, 12, 13]> input_497_cast_fp16 = reshape(shape = var_5262, x = matrix_bd_111_cast_fp16)[name = string("input_497_cast_fp16")];
+            tensor<int32, [10]> matrix_bd_113_pad_0 = const()[name = string("matrix_bd_113_pad_0"), val = tensor<int32, [10]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139769664)))];
+            string matrix_bd_113_mode_0 = const()[name = string("matrix_bd_113_mode_0"), val = string("constant")];
+            fp16 const_174_to_fp16 = const()[name = string("const_174_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 8, 5, 12, 25]> matrix_bd_113_cast_fp16 = pad(constant_val = const_174_to_fp16, mode = matrix_bd_113_mode_0, pad = matrix_bd_113_pad_0, x = input_497_cast_fp16)[name = string("matrix_bd_113_cast_fp16")];
+            tensor<int32, [4]> var_5266 = const()[name = string("op_5266"), val = tensor<int32, [4]>([1, 8, 5, 300])];
+            tensor<fp16, [1, 8, 5, 300]> matrix_bd_115_cast_fp16 = reshape(shape = var_5266, x = matrix_bd_113_cast_fp16)[name = string("matrix_bd_115_cast_fp16")];
+            tensor<int32, [4]> matrix_bd_117_begin_0 = const()[name = string("matrix_bd_117_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> matrix_bd_117_end_0 = const()[name = string("matrix_bd_117_end_0"), val = tensor<int32, [4]>([1, 8, 5, 288])];
+            tensor<bool, [4]> matrix_bd_117_end_mask_0 = const()[name = string("matrix_bd_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 8, 5, 288]> matrix_bd_117_cast_fp16 = slice_by_index(begin = matrix_bd_117_begin_0, end = matrix_bd_117_end_0, end_mask = matrix_bd_117_end_mask_0, x = matrix_bd_115_cast_fp16)[name = string("matrix_bd_117_cast_fp16")];
+            tensor<int32, [5]> var_5272 = const()[name = string("op_5272"), val = tensor<int32, [5]>([1, 8, 5, 12, 24])];
+            tensor<fp16, [1, 8, 5, 12, 24]> matrix_bd_cast_fp16 = reshape(shape = var_5272, x = matrix_bd_117_cast_fp16)[name = string("matrix_bd_cast_fp16")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_67_cast_fp16 = add(x = matrix_ac_cast_fp16, y = matrix_bd_cast_fp16)[name = string("attn_67_cast_fp16")];
+            fp16 _inversed_5275_y_0_to_fp16 = const()[name = string("_inversed_5275_y_0_to_fp16"), val = fp16(0x1.47cp-6)];
+            tensor<fp16, [1, 8, 5, 12, 24]> _inversed_5275_cast_fp16 = mul(x = attn_67_cast_fp16, y = _inversed_5275_y_0_to_fp16)[name = string("_inversed_5275_cast_fp16")];
+            string _inversed_5275_cast_fp16_to_fp32_dtype_0 = const()[name = string("_inversed_5275_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> _inversed_5275_cast_fp16_to_fp32 = cast(dtype = _inversed_5275_cast_fp16_to_fp32_dtype_0, x = _inversed_5275_cast_fp16)[name = string("cast_30")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_5276 = tanh(x = _inversed_5275_cast_fp16_to_fp32)[name = string("op_5276")];
+            string var_5276_to_fp16_dtype_0 = const()[name = string("op_5276_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 self_attns_11_softcap_to_fp16 = const()[name = string("self_attns_11_softcap_to_fp16"), val = fp16(0x1.9p+5)];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_5276_to_fp16 = cast(dtype = var_5276_to_fp16_dtype_0, x = var_5276)[name = string("cast_29")];
+            tensor<fp16, [1, 8, 5, 12, 24]> attn_69_cast_fp16 = mul(x = var_5276_to_fp16, y = self_attns_11_softcap_to_fp16)[name = string("attn_69_cast_fp16")];
+            string attn_69_cast_fp16_to_fp32_dtype_0 = const()[name = string("attn_69_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 8, 5, 12, 24]> attn_69_cast_fp16_to_fp32 = cast(dtype = attn_69_cast_fp16_to_fp32_dtype_0, x = attn_69_cast_fp16)[name = string("cast_28")];
+            tensor<fp32, [1, 8, 5, 12, 24]> input_499 = select(a = var_5139, b = attn_69_cast_fp16_to_fp32, cond = var_460)[name = string("input_499")];
+            tensor<fp32, [1, 8, 5, 12, 24]> var_5280 = softmax(axis = var_5138, x = input_499)[name = string("op_5280")];
+            tensor<int32, [5]> var_5282 = const()[name = string("op_5282"), val = tensor<int32, [5]>([0, 3, 1, -3, -1])];
+            bool out_67_transpose_x_0 = const()[name = string("out_67_transpose_x_0"), val = bool(false)];
+            bool out_67_transpose_y_0 = const()[name = string("out_67_transpose_y_0"), val = bool(false)];
+            string var_5280_to_fp16_dtype_0 = const()[name = string("op_5280_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 8, 5, 24, 128]> values_t_cast_fp16 = transpose(perm = var_5282, x = v_blocks_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 8, 5, 12, 24]> var_5280_to_fp16 = cast(dtype = var_5280_to_fp16_dtype_0, x = var_5280)[name = string("cast_27")];
+            tensor<fp16, [1, 8, 5, 12, 128]> out_67_cast_fp16 = matmul(transpose_x = out_67_transpose_x_0, transpose_y = out_67_transpose_y_0, x = var_5280_to_fp16, y = values_t_cast_fp16)[name = string("out_67_cast_fp16")];
+            tensor<int32, [5]> var_5285 = const()[name = string("op_5285"), val = tensor<int32, [5]>([0, 2, 3, 1, 4])];
+            tensor<int32, [3]> var_5287 = const()[name = string("op_5287"), val = tensor<int32, [3]>([1, 60, 1024])];
+            tensor<fp16, [1, 5, 12, 8, 128]> var_5286_cast_fp16 = transpose(perm = var_5285, x = out_67_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 60, 1024]> out_69_cast_fp16 = reshape(shape = var_5287, x = var_5286_cast_fp16)[name = string("out_69_cast_fp16")];
+            tensor<int32, [3]> var_5290_begin_0 = const()[name = string("op_5290_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_5290_end_0 = const()[name = string("op_5290_end_0"), val = tensor<int32, [3]>([1, 50, 1024])];
+            tensor<bool, [3]> var_5290_end_mask_0 = const()[name = string("op_5290_end_mask_0"), val = tensor<bool, [3]>([true, false, true])];
+            tensor<fp16, [1, 50, 1024]> var_5290_cast_fp16 = slice_by_index(begin = var_5290_begin_0, end = var_5290_end_0, end_mask = var_5290_end_mask_0, x = out_69_cast_fp16)[name = string("op_5290_cast_fp16")];
+            fp16 self_attns_11_post_input_min_to_fp16 = const()[name = string("self_attns_11_post_input_min_to_fp16"), val = fp16(-0x1.eep+3)];
+            fp16 self_attns_11_post_input_max_to_fp16 = const()[name = string("self_attns_11_post_input_max_to_fp16"), val = fp16(0x1.eap+3)];
+            tensor<fp16, [1, 50, 1024]> clip_321_cast_fp16 = clip(alpha = self_attns_11_post_input_min_to_fp16, beta = self_attns_11_post_input_max_to_fp16, x = var_5290_cast_fp16)[name = string("clip_321_cast_fp16")];
+            tensor<fp16, [1024, 1024]> self_attns_11_post_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(139769792))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(140294144))))[name = string("self_attns_11_post_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_128_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = self_attns_11_post_linear_weight_to_fp16_palettized, x = clip_321_cast_fp16)[name = string("linear_128_cast_fp16")];
+            fp16 self_attns_11_post_output_min_to_fp16 = const()[name = string("self_attns_11_post_output_min_to_fp16"), val = fp16(-0x1.2p+6)];
+            fp16 self_attns_11_post_output_max_to_fp16 = const()[name = string("self_attns_11_post_output_max_to_fp16"), val = fp16(0x1.1ep+6)];
+            tensor<fp16, [1, 50, 1024]> clip_322_cast_fp16 = clip(alpha = self_attns_11_post_output_min_to_fp16, beta = self_attns_11_post_output_max_to_fp16, x = linear_128_cast_fp16)[name = string("clip_322_cast_fp16")];
+            fp16 var_5302_to_fp16 = const()[name = string("op_5302_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_5303_to_fp16 = const()[name = string("op_5303_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_323_cast_fp16 = clip(alpha = var_5302_to_fp16, beta = var_5303_to_fp16, x = clip_322_cast_fp16)[name = string("clip_323_cast_fp16")];
+            string clip_323_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_323_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5305 = const()[name = string("op_5305"), val = fp32(-0x1p-1)];
+            fp32 var_5309_promoted = const()[name = string("op_5309_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_323_cast_fp16_to_fp32 = cast(dtype = clip_323_cast_fp16_to_fp32_dtype_0, x = clip_323_cast_fp16)[name = string("cast_26")];
+            tensor<fp32, [1, 50, 1024]> var_5315 = pow(x = clip_323_cast_fp16_to_fp32, y = var_5309_promoted)[name = string("op_5315")];
+            tensor<int32, [1]> var_5317_axes_0 = const()[name = string("op_5317_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5317_keep_dims_0 = const()[name = string("op_5317_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5317 = reduce_mean(axes = var_5317_axes_0, keep_dims = var_5317_keep_dims_0, x = var_5315)[name = string("op_5317")];
+            string var_5317_to_fp16_dtype_0 = const()[name = string("op_5317_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5318_to_fp16 = const()[name = string("op_5318_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5317_to_fp16 = cast(dtype = var_5317_to_fp16_dtype_0, x = var_5317)[name = string("cast_25")];
+            tensor<fp16, [1, 50, 1]> mean_squared_205_cast_fp16 = add(x = var_5317_to_fp16, y = var_5318_to_fp16)[name = string("mean_squared_205_cast_fp16")];
+            string mean_squared_205_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_205_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_205_cast_fp16_to_fp32 = cast(dtype = mean_squared_205_cast_fp16_to_fp32_dtype_0, x = mean_squared_205_cast_fp16)[name = string("cast_24")];
+            tensor<fp32, [1, 50, 1]> var_5320 = pow(x = mean_squared_205_cast_fp16_to_fp32, y = var_5305)[name = string("op_5320")];
+            string var_5320_to_fp16_dtype_0 = const()[name = string("op_5320_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_5320_to_fp16 = cast(dtype = var_5320_to_fp16_dtype_0, x = var_5320)[name = string("cast_23")];
+            tensor<fp16, [1, 50, 1024]> normed_output_409_cast_fp16 = mul(x = clip_323_cast_fp16, y = var_5320_to_fp16)[name = string("normed_output_409_cast_fp16")];
+            tensor<fp16, [1024]> const_175_to_fp16 = const()[name = string("const_175_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(140295232)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_411_cast_fp16 = mul(x = normed_output_409_cast_fp16, y = const_175_to_fp16)[name = string("normed_output_411_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1183_cast_fp16 = add(x = normed_output_411_cast_fp16, y = hidden_states_1157_cast_fp16)[name = string("hidden_states_1183_cast_fp16")];
+            string hidden_states_1183_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1183_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5327 = const()[name = string("op_5327"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_5328 = const()[name = string("op_5328"), val = fp32(-0x1.2a05f2p+33)];
+            fp32 var_5340 = const()[name = string("op_5340"), val = fp32(-0x1p-1)];
+            fp32 var_5336_promoted = const()[name = string("op_5336_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1183_cast_fp16_to_fp32 = cast(dtype = hidden_states_1183_cast_fp16_to_fp32_dtype_0, x = hidden_states_1183_cast_fp16)[name = string("cast_22")];
+            tensor<fp32, [1, 50, 1024]> var_5348 = pow(x = hidden_states_1183_cast_fp16_to_fp32, y = var_5336_promoted)[name = string("op_5348")];
+            tensor<int32, [1]> var_5350_axes_0 = const()[name = string("op_5350_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5350_keep_dims_0 = const()[name = string("op_5350_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5350 = reduce_mean(axes = var_5350_axes_0, keep_dims = var_5350_keep_dims_0, x = var_5348)[name = string("op_5350")];
+            string var_5350_to_fp16_dtype_0 = const()[name = string("op_5350_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5351_to_fp16 = const()[name = string("op_5351_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5350_to_fp16 = cast(dtype = var_5350_to_fp16_dtype_0, x = var_5350)[name = string("cast_21")];
+            tensor<fp16, [1, 50, 1]> mean_squared_207_cast_fp16 = add(x = var_5350_to_fp16, y = var_5351_to_fp16)[name = string("mean_squared_207_cast_fp16")];
+            string mean_squared_207_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_207_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_207_cast_fp16_to_fp32 = cast(dtype = mean_squared_207_cast_fp16_to_fp32_dtype_0, x = mean_squared_207_cast_fp16)[name = string("cast_20")];
+            tensor<fp32, [1, 50, 1]> var_5353 = pow(x = mean_squared_207_cast_fp16_to_fp32, y = var_5340)[name = string("op_5353")];
+            string var_5353_to_fp16_dtype_0 = const()[name = string("op_5353_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_5353_to_fp16 = cast(dtype = var_5353_to_fp16_dtype_0, x = var_5353)[name = string("cast_19")];
+            tensor<fp16, [1, 50, 1024]> normed_output_413_cast_fp16 = mul(x = hidden_states_1183_cast_fp16, y = var_5353_to_fp16)[name = string("normed_output_413_cast_fp16")];
+            tensor<fp16, [1024]> const_176_to_fp16 = const()[name = string("const_176_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(140297344)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_415_cast_fp16 = mul(x = normed_output_413_cast_fp16, y = const_176_to_fp16)[name = string("normed_output_415_cast_fp16")];
+            fp16 lconv1ds_11_linear_start_input_min_to_fp16 = const()[name = string("lconv1ds_11_linear_start_input_min_to_fp16"), val = fp16(-0x1.64p+3)];
+            fp16 lconv1ds_11_linear_start_input_max_to_fp16 = const()[name = string("lconv1ds_11_linear_start_input_max_to_fp16"), val = fp16(0x1.6p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_324_cast_fp16 = clip(alpha = lconv1ds_11_linear_start_input_min_to_fp16, beta = lconv1ds_11_linear_start_input_max_to_fp16, x = normed_output_415_cast_fp16)[name = string("clip_324_cast_fp16")];
+            tensor<fp16, [2048, 1024]> lconv1ds_11_linear_start_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(140299456))), lut = tensor<fp16, [64, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141348096))))[name = string("lconv1ds_11_linear_start_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 2048]> linear_129_cast_fp16 = linear(bias = linear_8_bias_0_to_fp16, weight = lconv1ds_11_linear_start_linear_weight_to_fp16_palettized, x = clip_324_cast_fp16)[name = string("linear_129_cast_fp16")];
+            fp16 lconv1ds_11_linear_start_output_min_to_fp16 = const()[name = string("lconv1ds_11_linear_start_output_min_to_fp16"), val = fp16(-0x1.b6p+4)];
+            fp16 lconv1ds_11_linear_start_output_max_to_fp16 = const()[name = string("lconv1ds_11_linear_start_output_max_to_fp16"), val = fp16(0x1.b2p+4)];
+            tensor<fp16, [1, 50, 2048]> clip_325_cast_fp16 = clip(alpha = lconv1ds_11_linear_start_output_min_to_fp16, beta = lconv1ds_11_linear_start_output_max_to_fp16, x = linear_129_cast_fp16)[name = string("clip_325_cast_fp16")];
+            int32 hidden_states_1191_split_num_splits_0 = const()[name = string("hidden_states_1191_split_num_splits_0"), val = int32(2)];
+            int32 hidden_states_1191_split_axis_0 = const()[name = string("hidden_states_1191_split_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1191_split_cast_fp16_0, tensor<fp16, [1, 50, 1024]> hidden_states_1191_split_cast_fp16_1 = split(axis = hidden_states_1191_split_axis_0, num_splits = hidden_states_1191_split_num_splits_0, x = clip_325_cast_fp16)[name = string("hidden_states_1191_split_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1191_split_1_sigmoid_cast_fp16 = sigmoid(x = hidden_states_1191_split_cast_fp16_1)[name = string("hidden_states_1191_split_1_sigmoid_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1191_cast_fp16 = mul(x = hidden_states_1191_split_cast_fp16_0, y = hidden_states_1191_split_1_sigmoid_cast_fp16)[name = string("hidden_states_1191_cast_fp16")];
+            tensor<int32, [3]> input_507_perm_0 = const()[name = string("input_507_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [6]> input_509_pad_0 = const()[name = string("input_509_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 4, 0])];
+            string input_509_mode_0 = const()[name = string("input_509_mode_0"), val = string("constant")];
+            fp16 const_177_to_fp16 = const()[name = string("const_177_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 1024, 50]> input_507_cast_fp16 = transpose(perm = input_507_perm_0, x = hidden_states_1191_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 1024, 54]> input_509_cast_fp16 = pad(constant_val = const_177_to_fp16, mode = input_509_mode_0, pad = input_509_pad_0, x = input_507_cast_fp16)[name = string("input_509_cast_fp16")];
+            string var_5379_pad_type_0 = const()[name = string("op_5379_pad_type_0"), val = string("valid")];
+            int32 var_5379_groups_0 = const()[name = string("op_5379_groups_0"), val = int32(1024)];
+            tensor<int32, [1]> var_5379_strides_0 = const()[name = string("op_5379_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5379_pad_0 = const()[name = string("op_5379_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5379_dilations_0 = const()[name = string("op_5379_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1024, 1, 5]> lconv1ds_11_depthwise_conv1d_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1, 5]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141350208))), lut = tensor<fp16, [32, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141352832))))[name = string("lconv1ds_11_depthwise_conv1d_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 1024, 50]> var_5379_cast_fp16 = conv(dilations = var_5379_dilations_0, groups = var_5379_groups_0, pad = var_5379_pad_0, pad_type = var_5379_pad_type_0, strides = var_5379_strides_0, weight = lconv1ds_11_depthwise_conv1d_weight_to_fp16_palettized, x = input_509_cast_fp16)[name = string("op_5379_cast_fp16")];
+            tensor<int32, [3]> hidden_states_1193_perm_0 = const()[name = string("hidden_states_1193_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            string hidden_states_1193_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1193_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1193_cast_fp16 = transpose(perm = hidden_states_1193_perm_0, x = var_5379_cast_fp16)[name = string("transpose_0")];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1193_cast_fp16_to_fp32 = cast(dtype = hidden_states_1193_cast_fp16_to_fp32_dtype_0, x = hidden_states_1193_cast_fp16)[name = string("cast_18")];
+            tensor<fp32, [1, 50, 1024]> clip_326 = clip(alpha = var_5328, beta = var_5327, x = hidden_states_1193_cast_fp16_to_fp32)[name = string("clip_326")];
+            fp32 var_5336_promoted_1 = const()[name = string("op_5336_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_5384 = pow(x = clip_326, y = var_5336_promoted_1)[name = string("op_5384")];
+            tensor<int32, [1]> var_5386_axes_0 = const()[name = string("op_5386_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5386_keep_dims_0 = const()[name = string("op_5386_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5386 = reduce_mean(axes = var_5386_axes_0, keep_dims = var_5386_keep_dims_0, x = var_5384)[name = string("op_5386")];
+            string var_5386_to_fp16_dtype_0 = const()[name = string("op_5386_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5387_to_fp16 = const()[name = string("op_5387_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5386_to_fp16 = cast(dtype = var_5386_to_fp16_dtype_0, x = var_5386)[name = string("cast_17")];
+            tensor<fp16, [1, 50, 1]> mean_squared_209_cast_fp16 = add(x = var_5386_to_fp16, y = var_5387_to_fp16)[name = string("mean_squared_209_cast_fp16")];
+            string mean_squared_209_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_209_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_209_cast_fp16_to_fp32 = cast(dtype = mean_squared_209_cast_fp16_to_fp32_dtype_0, x = mean_squared_209_cast_fp16)[name = string("cast_16")];
+            tensor<fp32, [1, 50, 1]> var_5389 = pow(x = mean_squared_209_cast_fp16_to_fp32, y = var_5340)[name = string("op_5389")];
+            string clip_326_to_fp16_dtype_0 = const()[name = string("clip_326_to_fp16_dtype_0"), val = string("fp16")];
+            string var_5389_to_fp16_dtype_0 = const()[name = string("op_5389_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_326_to_fp16 = cast(dtype = clip_326_to_fp16_dtype_0, x = clip_326)[name = string("cast_14")];
+            tensor<fp16, [1, 50, 1]> var_5389_to_fp16 = cast(dtype = var_5389_to_fp16_dtype_0, x = var_5389)[name = string("cast_15")];
+            tensor<fp16, [1, 50, 1024]> normed_output_417_cast_fp16 = mul(x = clip_326_to_fp16, y = var_5389_to_fp16)[name = string("normed_output_417_cast_fp16")];
+            tensor<fp16, [1024]> const_178_to_fp16 = const()[name = string("const_178_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141353920)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_419_cast_fp16 = mul(x = normed_output_417_cast_fp16, y = const_178_to_fp16)[name = string("normed_output_419_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1199_cast_fp16 = silu(x = normed_output_419_cast_fp16)[name = string("hidden_states_1199_cast_fp16")];
+            fp16 lconv1ds_11_linear_end_input_min_to_fp16 = const()[name = string("lconv1ds_11_linear_end_input_min_to_fp16"), val = fp16(-0x1.e6p+4)];
+            fp16 lconv1ds_11_linear_end_input_max_to_fp16 = const()[name = string("lconv1ds_11_linear_end_input_max_to_fp16"), val = fp16(0x1.e2p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_327_cast_fp16 = clip(alpha = lconv1ds_11_linear_end_input_min_to_fp16, beta = lconv1ds_11_linear_end_input_max_to_fp16, x = hidden_states_1199_cast_fp16)[name = string("clip_327_cast_fp16")];
+            tensor<fp16, [1024, 1024]> lconv1ds_11_linear_end_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141356032))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141880384))))[name = string("lconv1ds_11_linear_end_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_130_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = lconv1ds_11_linear_end_linear_weight_to_fp16_palettized, x = clip_327_cast_fp16)[name = string("linear_130_cast_fp16")];
+            fp16 lconv1ds_11_linear_end_output_min_to_fp16 = const()[name = string("lconv1ds_11_linear_end_output_min_to_fp16"), val = fp16(-0x1.06p+4)];
+            fp16 lconv1ds_11_linear_end_output_max_to_fp16 = const()[name = string("lconv1ds_11_linear_end_output_max_to_fp16"), val = fp16(0x1.04p+4)];
+            tensor<fp16, [1, 50, 1024]> clip_328_cast_fp16 = clip(alpha = lconv1ds_11_linear_end_output_min_to_fp16, beta = lconv1ds_11_linear_end_output_max_to_fp16, x = linear_130_cast_fp16)[name = string("clip_328_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1205_cast_fp16 = add(x = clip_328_cast_fp16, y = hidden_states_1183_cast_fp16)[name = string("hidden_states_1205_cast_fp16")];
+            string hidden_states_1205_cast_fp16_to_fp32_dtype_0 = const()[name = string("hidden_states_1205_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5413 = const()[name = string("op_5413"), val = fp32(-0x1p-1)];
+            fp32 var_5414 = const()[name = string("op_5414"), val = fp32(0x1.2a05f2p+33)];
+            fp32 var_5415 = const()[name = string("op_5415"), val = fp32(-0x1.2a05f2p+33)];
+            tensor<fp32, [1, 50, 1024]> hidden_states_1205_cast_fp16_to_fp32 = cast(dtype = hidden_states_1205_cast_fp16_to_fp32_dtype_0, x = hidden_states_1205_cast_fp16)[name = string("cast_13")];
+            tensor<fp32, [1, 50, 1024]> clip_329 = clip(alpha = var_5415, beta = var_5414, x = hidden_states_1205_cast_fp16_to_fp32)[name = string("clip_329")];
+            fp32 var_5409_promoted = const()[name = string("op_5409_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_5423 = pow(x = clip_329, y = var_5409_promoted)[name = string("op_5423")];
+            tensor<int32, [1]> var_5425_axes_0 = const()[name = string("op_5425_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5425_keep_dims_0 = const()[name = string("op_5425_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5425 = reduce_mean(axes = var_5425_axes_0, keep_dims = var_5425_keep_dims_0, x = var_5423)[name = string("op_5425")];
+            string var_5425_to_fp16_dtype_0 = const()[name = string("op_5425_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5426_to_fp16 = const()[name = string("op_5426_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5425_to_fp16 = cast(dtype = var_5425_to_fp16_dtype_0, x = var_5425)[name = string("cast_12")];
+            tensor<fp16, [1, 50, 1]> mean_squared_211_cast_fp16 = add(x = var_5425_to_fp16, y = var_5426_to_fp16)[name = string("mean_squared_211_cast_fp16")];
+            string mean_squared_211_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_211_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_211_cast_fp16_to_fp32 = cast(dtype = mean_squared_211_cast_fp16_to_fp32_dtype_0, x = mean_squared_211_cast_fp16)[name = string("cast_11")];
+            tensor<fp32, [1, 50, 1]> var_5428 = pow(x = mean_squared_211_cast_fp16_to_fp32, y = var_5413)[name = string("op_5428")];
+            string clip_329_to_fp16_dtype_0 = const()[name = string("clip_329_to_fp16_dtype_0"), val = string("fp16")];
+            string var_5428_to_fp16_dtype_0 = const()[name = string("op_5428_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_329_to_fp16 = cast(dtype = clip_329_to_fp16_dtype_0, x = clip_329)[name = string("cast_9")];
+            tensor<fp16, [1, 50, 1]> var_5428_to_fp16 = cast(dtype = var_5428_to_fp16_dtype_0, x = var_5428)[name = string("cast_10")];
+            tensor<fp16, [1, 50, 1024]> normed_output_421_cast_fp16 = mul(x = clip_329_to_fp16, y = var_5428_to_fp16)[name = string("normed_output_421_cast_fp16")];
+            tensor<fp16, [1024]> const_179_to_fp16 = const()[name = string("const_179_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141881472)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_423_cast_fp16 = mul(x = normed_output_421_cast_fp16, y = const_179_to_fp16)[name = string("normed_output_423_cast_fp16")];
+            fp16 feed_forward2s_11_ffw_layer_1_input_min_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_1_input_min_to_fp16"), val = fp16(-0x1.9cp+3)];
+            fp16 feed_forward2s_11_ffw_layer_1_input_max_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_1_input_max_to_fp16"), val = fp16(0x1.98p+3)];
+            tensor<fp16, [1, 50, 1024]> clip_330_cast_fp16 = clip(alpha = feed_forward2s_11_ffw_layer_1_input_min_to_fp16, beta = feed_forward2s_11_ffw_layer_1_input_max_to_fp16, x = normed_output_423_cast_fp16)[name = string("clip_330_cast_fp16")];
+            tensor<fp16, [4096, 1024]> feed_forward2s_11_ffw_layer_1_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(141883584))), lut = tensor<fp16, [128, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(143980800))))[name = string("feed_forward2s_11_ffw_layer_1_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 4096]> linear_131_cast_fp16 = linear(bias = linear_1_bias_0_to_fp16, weight = feed_forward2s_11_ffw_layer_1_linear_weight_to_fp16_palettized, x = clip_330_cast_fp16)[name = string("linear_131_cast_fp16")];
+            fp16 feed_forward2s_11_ffw_layer_1_output_min_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_1_output_min_to_fp16"), val = fp16(-0x1.44p+5)];
+            fp16 feed_forward2s_11_ffw_layer_1_output_max_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_1_output_max_to_fp16"), val = fp16(0x1.4p+5)];
+            tensor<fp16, [1, 50, 4096]> clip_331_cast_fp16 = clip(alpha = feed_forward2s_11_ffw_layer_1_output_min_to_fp16, beta = feed_forward2s_11_ffw_layer_1_output_max_to_fp16, x = linear_131_cast_fp16)[name = string("clip_331_cast_fp16")];
+            tensor<fp16, [1, 50, 4096]> hidden_states_1215_cast_fp16 = silu(x = clip_331_cast_fp16)[name = string("hidden_states_1215_cast_fp16")];
+            fp16 feed_forward2s_11_ffw_layer_2_input_min_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_2_input_min_to_fp16"), val = fp16(-0x1.6ap+3)];
+            fp16 feed_forward2s_11_ffw_layer_2_input_max_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_2_input_max_to_fp16"), val = fp16(0x1.66p+3)];
+            tensor<fp16, [1, 50, 4096]> clip_332_cast_fp16 = clip(alpha = feed_forward2s_11_ffw_layer_2_input_min_to_fp16, beta = feed_forward2s_11_ffw_layer_2_input_max_to_fp16, x = hidden_states_1215_cast_fp16)[name = string("clip_332_cast_fp16")];
+            tensor<fp16, [1024, 4096]> feed_forward2s_11_ffw_layer_2_linear_weight_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 4096]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(143984960))), lut = tensor<fp16, [32, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(146082176))))[name = string("feed_forward2s_11_ffw_layer_2_linear_weight_to_fp16_palettized")];
+            tensor<fp16, [1, 50, 1024]> linear_132_cast_fp16 = linear(bias = linear_0_bias_0_to_fp16, weight = feed_forward2s_11_ffw_layer_2_linear_weight_to_fp16_palettized, x = clip_332_cast_fp16)[name = string("linear_132_cast_fp16")];
+            fp16 feed_forward2s_11_ffw_layer_2_output_min_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_2_output_min_to_fp16"), val = fp16(-0x1.c4p+5)];
+            fp16 feed_forward2s_11_ffw_layer_2_output_max_to_fp16 = const()[name = string("feed_forward2s_11_ffw_layer_2_output_max_to_fp16"), val = fp16(0x1.cp+5)];
+            tensor<fp16, [1, 50, 1024]> clip_333_cast_fp16 = clip(alpha = feed_forward2s_11_ffw_layer_2_output_min_to_fp16, beta = feed_forward2s_11_ffw_layer_2_output_max_to_fp16, x = linear_132_cast_fp16)[name = string("clip_333_cast_fp16")];
+            string clip_333_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_333_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1024]> clip_333_cast_fp16_to_fp32 = cast(dtype = clip_333_cast_fp16_to_fp32_dtype_0, x = clip_333_cast_fp16)[name = string("cast_8")];
+            tensor<fp32, [1, 50, 1024]> clip_334 = clip(alpha = var_5415, beta = var_5414, x = clip_333_cast_fp16_to_fp32)[name = string("clip_334")];
+            fp32 var_5409_promoted_1 = const()[name = string("op_5409_promoted_1"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> var_5455 = pow(x = clip_334, y = var_5409_promoted_1)[name = string("op_5455")];
+            tensor<int32, [1]> var_5457_axes_0 = const()[name = string("op_5457_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5457_keep_dims_0 = const()[name = string("op_5457_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5457 = reduce_mean(axes = var_5457_axes_0, keep_dims = var_5457_keep_dims_0, x = var_5455)[name = string("op_5457")];
+            string var_5457_to_fp16_dtype_0 = const()[name = string("op_5457_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5458_to_fp16 = const()[name = string("op_5458_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5457_to_fp16 = cast(dtype = var_5457_to_fp16_dtype_0, x = var_5457)[name = string("cast_7")];
+            tensor<fp16, [1, 50, 1]> mean_squared_213_cast_fp16 = add(x = var_5457_to_fp16, y = var_5458_to_fp16)[name = string("mean_squared_213_cast_fp16")];
+            string mean_squared_213_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_213_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_213_cast_fp16_to_fp32 = cast(dtype = mean_squared_213_cast_fp16_to_fp32_dtype_0, x = mean_squared_213_cast_fp16)[name = string("cast_6")];
+            tensor<fp32, [1, 50, 1]> var_5460 = pow(x = mean_squared_213_cast_fp16_to_fp32, y = var_5413)[name = string("op_5460")];
+            string clip_334_to_fp16_dtype_0 = const()[name = string("clip_334_to_fp16_dtype_0"), val = string("fp16")];
+            string var_5460_to_fp16_dtype_0 = const()[name = string("op_5460_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1024]> clip_334_to_fp16 = cast(dtype = clip_334_to_fp16_dtype_0, x = clip_334)[name = string("cast_4")];
+            tensor<fp16, [1, 50, 1]> var_5460_to_fp16 = cast(dtype = var_5460_to_fp16_dtype_0, x = var_5460)[name = string("cast_5")];
+            tensor<fp16, [1, 50, 1024]> normed_output_425_cast_fp16 = mul(x = clip_334_to_fp16, y = var_5460_to_fp16)[name = string("normed_output_425_cast_fp16")];
+            tensor<fp16, [1024]> const_180_to_fp16 = const()[name = string("const_180_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(146083264)))];
+            tensor<fp16, [1, 50, 1024]> normed_output_427_cast_fp16 = mul(x = normed_output_425_cast_fp16, y = const_180_to_fp16)[name = string("normed_output_427_cast_fp16")];
+            fp16 var_5405_to_fp16 = const()[name = string("op_5405_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1227_cast_fp16 = mul(x = normed_output_427_cast_fp16, y = var_5405_to_fp16)[name = string("hidden_states_1227_cast_fp16")];
+            tensor<fp16, [1, 50, 1024]> hidden_states_1229_cast_fp16 = add(x = hidden_states_1227_cast_fp16, y = hidden_states_1205_cast_fp16)[name = string("hidden_states_1229_cast_fp16")];
+            fp16 var_5467_to_fp16 = const()[name = string("op_5467_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            fp16 var_5468_to_fp16 = const()[name = string("op_5468_to_fp16"), val = fp16(0x1.ffcp+15)];
+            tensor<fp16, [1, 50, 1024]> clip_335_cast_fp16 = clip(alpha = var_5467_to_fp16, beta = var_5468_to_fp16, x = hidden_states_1229_cast_fp16)[name = string("clip_335_cast_fp16")];
+            string clip_335_cast_fp16_to_fp32_dtype_0 = const()[name = string("clip_335_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            fp32 var_5470 = const()[name = string("op_5470"), val = fp32(-0x1p-1)];
+            fp32 var_5474_promoted = const()[name = string("op_5474_promoted"), val = fp32(0x1p+1)];
+            tensor<fp32, [1, 50, 1024]> clip_335_cast_fp16_to_fp32 = cast(dtype = clip_335_cast_fp16_to_fp32_dtype_0, x = clip_335_cast_fp16)[name = string("cast_3")];
+            tensor<fp32, [1, 50, 1024]> var_5480 = pow(x = clip_335_cast_fp16_to_fp32, y = var_5474_promoted)[name = string("op_5480")];
+            tensor<int32, [1]> var_5482_axes_0 = const()[name = string("op_5482_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5482_keep_dims_0 = const()[name = string("op_5482_keep_dims_0"), val = bool(true)];
+            tensor<fp32, [1, 50, 1]> var_5482 = reduce_mean(axes = var_5482_axes_0, keep_dims = var_5482_keep_dims_0, x = var_5480)[name = string("op_5482")];
+            string var_5482_to_fp16_dtype_0 = const()[name = string("op_5482_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 var_5483_to_fp16 = const()[name = string("op_5483_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 50, 1]> var_5482_to_fp16 = cast(dtype = var_5482_to_fp16_dtype_0, x = var_5482)[name = string("cast_2")];
+            tensor<fp16, [1, 50, 1]> mean_squared_cast_fp16 = add(x = var_5482_to_fp16, y = var_5483_to_fp16)[name = string("mean_squared_cast_fp16")];
+            string mean_squared_cast_fp16_to_fp32_dtype_0 = const()[name = string("mean_squared_cast_fp16_to_fp32_dtype_0"), val = string("fp32")];
+            tensor<fp32, [1, 50, 1]> mean_squared_cast_fp16_to_fp32 = cast(dtype = mean_squared_cast_fp16_to_fp32_dtype_0, x = mean_squared_cast_fp16)[name = string("cast_1")];
+            tensor<fp32, [1, 50, 1]> var_5485 = pow(x = mean_squared_cast_fp16_to_fp32, y = var_5470)[name = string("op_5485")];
+            string var_5485_to_fp16_dtype_0 = const()[name = string("op_5485_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 50, 1]> var_5485_to_fp16 = cast(dtype = var_5485_to_fp16_dtype_0, x = var_5485)[name = string("cast_0")];
+            tensor<fp16, [1, 50, 1024]> normed_output_429_cast_fp16 = mul(x = clip_335_cast_fp16, y = var_5485_to_fp16)[name = string("normed_output_429_cast_fp16")];
+            tensor<fp16, [1024]> const_181_to_fp16 = const()[name = string("const_181_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(146085376)))];
+            tensor<fp16, [1, 50, 1024]> hidden_states = mul(x = normed_output_429_cast_fp16, y = const_181_to_fp16)[name = string("normed_output_cast_fp16")];
+        } -> (hidden_states);
+}
\ No newline at end of file
diff --git a/audio.mlmodelc/weights/weight.bin b/audio.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5e2b7538355d63833c6daf4be53fe1f81193b8f1
--- /dev/null
+++ b/audio.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37da916ac6ac7911266a9c7532a681e4039aea7ce13bf570d80636b705dc6163
+size 146087488
diff --git a/audio_config.json b/audio_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b0fb4bca21c5d65a031d6906b7bc04d2cc6d921
--- /dev/null
+++ b/audio_config.json
@@ -0,0 +1,19 @@
+{
+  "sampling_rate": 16000,
+  "feature_size": 128,
+  "frame_length": 320,
+  "hop_length": 160,
+  "fft_length": 512,
+  "mel_floor": 1e-05,
+  "min_frequency": 0,
+  "max_frequency": 8000,
+  "log_offset": 0.001,
+  "preemphasis": 0.97,
+  "mel_frames": 200,
+  "num_tokens": 50,
+  "audio_token_id": 258881,
+  "boa_token_id": 256000,
+  "eoa_token_id": 258883,
+  "ms_per_token": 40,
+  "quantization": "int4"
+}
\ No newline at end of file
diff --git a/chunk1.mlmodelc/analytics/coremldata.bin b/chunk1.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0abffef7bd44682e24e07c8d917dc03a343cbb1f
--- /dev/null
+++ b/chunk1.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb7d0c074925f5e2b23d70754135276d3b38e5bb2ebf89df153a401e37ef2f57
+size 243
diff --git a/chunk1.mlmodelc/coremldata.bin b/chunk1.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..913f44ce44134bd04ea4c8694f887d628cf14ff2
--- /dev/null
+++ b/chunk1.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b53fc92f6d11bf88eb63b9a7af4a7211180e3c031115a9abfa20655814727d4
+size 1333
diff --git a/chunk1.mlmodelc/model.mil b/chunk1.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..5e2149d2cd1def034692619945c7c51459a61b0e
--- /dev/null
+++ b/chunk1.mlmodelc/model.mil
@@ -0,0 +1,8435 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func decode_q1<ios18>(tensor<fp16, [2, 2, 2048, 512]> K_full_in, tensor<fp16, [10, 2, 512, 512]> K_sliding_in, tensor<fp16, [2, 2, 2048, 512]> V_full_in, tensor<fp16, [10, 2, 512, 512]> V_sliding_in, tensor<fp16, [1, 1, 1, 2048]> causal_mask_full, tensor<fp16, [1, 1, 1, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 1, 512]> cos_f, tensor<fp16, [1, 1, 1, 256]> cos_s, tensor<fp16, [1, 1, 2560]> hidden_states, tensor<fp16, [1, 1, 10752]> per_layer_raw, tensor<fp16, [1, 1, 1, 512]> sin_f, tensor<fp16, [1, 1, 1, 256]> sin_s, tensor<fp16, [1, 1, 2048, 1]> update_mask) {
+            tensor<fp16, [10752, 2560, 1, 1]> per_layer_model_projection_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10752, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [336, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13762688))))[name = string("per_layer_model_projection_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13773504))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16395008))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16397120)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16397696))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17053120))))[name = string("layers_0_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17053696))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17709120))))[name = string("layers_0_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_k_norm_weight = const()[name = string("layers_0_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17709696)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17710272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30817536))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30827840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43935104))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43945408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57052672))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57055296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57060480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57388224))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57388544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60010048))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_q_norm_weight = const()[name = string("layers_1_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60012160)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60012736))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60668160))))[name = string("layers_1_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60668736))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61324160))))[name = string("layers_1_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_k_norm_weight = const()[name = string("layers_1_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61324736)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61325312))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74432576))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74442880))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87550144))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87560448))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100667712))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100670336)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100675520))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101003264))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101003584))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103625088))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103627200)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103627776))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104283200))))[name = string("layers_2_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104283776))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104939200))))[name = string("layers_2_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_k_norm_weight = const()[name = string("layers_2_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104939776)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104940352))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(118047616))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(118057920))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131165184))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131175488))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144282752))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144285376)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144290560))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144618304))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144618624))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147240128))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_q_norm_weight = const()[name = string("layers_3_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147242240)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147242816))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147898240))))[name = string("layers_3_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147898816))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148554240))))[name = string("layers_3_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_k_norm_weight = const()[name = string("layers_3_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148554816)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148555392))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161662656))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161672960))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174780224))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174790528))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187897792))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187900416)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187905600))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188233344))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188233664))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190855168))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_q_norm_weight = const()[name = string("layers_4_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190857280)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190857856))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191513280))))[name = string("layers_4_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191513856))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192169280))))[name = string("layers_4_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_k_norm_weight = const()[name = string("layers_4_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192169856)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192170432))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(205277696))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(205288000))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218395264))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218405568))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231512832))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231515456)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231520640))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231848384))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231848704))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(237091648))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_q_norm_weight = const()[name = string("layers_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(237095808)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(237096896))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(238407680))))[name = string("layers_5_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(238408768))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239719552))))[name = string("layers_5_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_k_norm_weight = const()[name = string("layers_5_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239720640)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239721728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252828992))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252839296))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265946560))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265956864))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279064128))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279066752)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279071936))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279399680))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279400000))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282021504))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282023616))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282679040))))[name = string("layers_6_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282679616))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283335040))))[name = string("layers_6_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_6_self_attn_k_norm_weight = const()[name = string("layers_6_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283335616)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283336192))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(296443456))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(296453760))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309561024))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309571328))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322678592))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322681216)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322686400))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(323014144))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(323014464))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(325635968))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_7_self_attn_q_norm_weight = const()[name = string("layers_7_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(325638080)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(325638656))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326294080))))[name = string("layers_7_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326294656))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326950080))))[name = string("layers_7_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326950656))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340057920))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340068224))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(353175488))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(353185792))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366293056))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366295680)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366300864))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366628608))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366628928))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369250432))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_8_self_attn_q_norm_weight = const()[name = string("layers_8_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369252544)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369253120))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369908544))))[name = string("layers_8_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369909120))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(370564544))))[name = string("layers_8_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_8_self_attn_k_norm_weight = const()[name = string("layers_8_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(370565120)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(370565696))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383672960))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383683264))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396790528))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396800832))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409908096))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409910720)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409915904))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(410243648))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_9_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(410243968))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412865472))))[name = string("layers_9_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_q_norm_weight = const()[name = string("layers_9_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412867584)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412868160))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413523584))))[name = string("layers_9_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413524160))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(414179584))))[name = string("layers_9_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_k_norm_weight = const()[name = string("layers_9_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(414180160)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(414180736))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(427288000))))[name = string("layers_9_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(427298304))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440405568))))[name = string("layers_9_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_9_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440415872))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453523136))))[name = string("layers_9_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_9_post_feedforward_layernorm_weight = const()[name = string("layers_9_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453525760)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_9_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453530944))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453858688))))[name = string("layers_9_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_10_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453859008))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(456480512))))[name = string("layers_10_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(456482624))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457138048))))[name = string("layers_10_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457138624))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457794048))))[name = string("layers_10_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457794624))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470901888))))[name = string("layers_10_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470912192))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(484019456))))[name = string("layers_10_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_10_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(484029760))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497137024))))[name = string("layers_10_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_10_post_feedforward_layernorm_weight = const()[name = string("layers_10_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497139648)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_10_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497144832))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497472576))))[name = string("layers_10_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_11_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497472896))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(502715840))))[name = string("layers_11_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_q_norm_weight = const()[name = string("layers_11_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(502720000)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(502721088))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504031872))))[name = string("layers_11_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504032960))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(505343744))))[name = string("layers_11_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_k_norm_weight = const()[name = string("layers_11_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(505344832)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(505345920))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(518453184))))[name = string("layers_11_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(518463488))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531570752))))[name = string("layers_11_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_11_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531581056))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(544688320))))[name = string("layers_11_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_11_post_feedforward_layernorm_weight = const()[name = string("layers_11_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(544690944)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_11_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(544696128))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545023872))))[name = string("layers_11_per_layer_input_gate_weight_palettized")];
+            tensor<int32, [3]> var_740 = const()[name = string("op_740"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_743_axes_0 = const()[name = string("op_743_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_741_cast_fp16 = transpose(perm = var_740, x = hidden_states)[name = string("transpose_217")];
+            tensor<fp16, [1, 2560, 1, 1]> var_743_cast_fp16 = expand_dims(axes = var_743_axes_0, x = var_741_cast_fp16)[name = string("op_743_cast_fp16")];
+            string var_759_pad_type_0 = const()[name = string("op_759_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_759_strides_0 = const()[name = string("op_759_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_759_pad_0 = const()[name = string("op_759_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_759_dilations_0 = const()[name = string("op_759_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_759_groups_0 = const()[name = string("op_759_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10752, 1, 1]> var_759 = conv(dilations = var_759_dilations_0, groups = var_759_groups_0, pad = var_759_pad_0, pad_type = var_759_pad_type_0, strides = var_759_strides_0, weight = per_layer_model_projection_weight_palettized, x = var_743_cast_fp16)[name = string("op_759")];
+            fp16 var_760_to_fp16 = const()[name = string("op_760_to_fp16"), val = fp16(0x1.43cp-6)];
+            tensor<fp16, [1, 10752, 1, 1]> proj_1_cast_fp16 = mul(x = var_759, y = var_760_to_fp16)[name = string("proj_1_cast_fp16")];
+            tensor<int32, [1]> var_763_axes_0 = const()[name = string("op_763_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 10752, 1]> var_763_cast_fp16 = squeeze(axes = var_763_axes_0, x = proj_1_cast_fp16)[name = string("op_763_cast_fp16")];
+            tensor<int32, [3]> var_767 = const()[name = string("op_767"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [3]> var_772 = const()[name = string("op_772"), val = tensor<int32, [3]>([1, 42, 256])];
+            tensor<fp16, [1, 1, 10752]> proj_cast_fp16 = transpose(perm = var_767, x = var_763_cast_fp16)[name = string("transpose_216")];
+            tensor<fp16, [1, 42, 256]> proj_grouped_cast_fp16 = reshape(shape = var_772, x = proj_cast_fp16)[name = string("proj_grouped_cast_fp16")];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 42, 256]> var_774_cast_fp16 = mul(x = proj_grouped_cast_fp16, y = const_0_promoted_to_fp16)[name = string("op_774_cast_fp16")];
+            int32 var_776 = const()[name = string("op_776"), val = int32(-1)];
+            bool input_3_interleave_0 = const()[name = string("input_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 42, 512]> input_3_cast_fp16 = concat(axis = var_776, interleave = input_3_interleave_0, values = (proj_grouped_cast_fp16, var_774_cast_fp16))[name = string("input_3_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_782_to_fp16 = const()[name = string("op_782_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 42, 512]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_782_to_fp16, x = input_3_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_785_split_sizes_0 = const()[name = string("op_785_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_785_axis_0 = const()[name = string("op_785_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 42, 256]> var_785_cast_fp16_0, tensor<fp16, [1, 42, 256]> var_785_cast_fp16_1 = split(axis = var_785_axis_0, split_sizes = var_785_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_785_cast_fp16")];
+            tensor<fp16, [256]> per_layer_projection_norm_weight_promoted_to_fp16 = const()[name = string("per_layer_projection_norm_weight_promoted_to_fp16"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545024192)))];
+            tensor<fp16, [1, 42, 256]> var_787_cast_fp16 = mul(x = var_785_cast_fp16_0, y = per_layer_projection_norm_weight_promoted_to_fp16)[name = string("op_787_cast_fp16")];
+            tensor<int32, [3]> var_791 = const()[name = string("op_791"), val = tensor<int32, [3]>([1, 1, 10752])];
+            tensor<fp16, [1, 1, 10752]> proj_normed_cast_fp16 = reshape(shape = var_791, x = var_787_cast_fp16)[name = string("proj_normed_cast_fp16")];
+            tensor<fp16, [1, 1, 10752]> var_794_cast_fp16 = add(x = proj_normed_cast_fp16, y = per_layer_raw)[name = string("op_794_cast_fp16")];
+            fp16 var_795_to_fp16 = const()[name = string("op_795_to_fp16"), val = fp16(0x1.6ap-1)];
+            tensor<fp16, [1, 1, 10752]> per_layer_combined_out = mul(x = var_794_cast_fp16, y = var_795_to_fp16)[name = string("per_layer_combined_cast_fp16")];
+            tensor<int32, [4]> var_799_begin_0 = const()[name = string("op_799_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_799_end_0 = const()[name = string("op_799_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_799_end_mask_0 = const()[name = string("op_799_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_799_squeeze_mask_0 = const()[name = string("op_799_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_799_cast_fp16 = slice_by_index(begin = var_799_begin_0, end = var_799_end_0, end_mask = var_799_end_mask_0, squeeze_mask = var_799_squeeze_mask_0, x = K_sliding_in)[name = string("op_799_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_1_axes_0 = const()[name = string("K_sliding_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_1_cast_fp16 = expand_dims(axes = K_sliding_slot_1_axes_0, x = var_799_cast_fp16)[name = string("K_sliding_slot_1_cast_fp16")];
+            tensor<int32, [4]> var_804_begin_0 = const()[name = string("op_804_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_804_end_0 = const()[name = string("op_804_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_804_end_mask_0 = const()[name = string("op_804_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_804_squeeze_mask_0 = const()[name = string("op_804_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_804_cast_fp16 = slice_by_index(begin = var_804_begin_0, end = var_804_end_0, end_mask = var_804_end_mask_0, squeeze_mask = var_804_squeeze_mask_0, x = V_sliding_in)[name = string("op_804_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_1_axes_0 = const()[name = string("V_sliding_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_1_cast_fp16 = expand_dims(axes = V_sliding_slot_1_axes_0, x = var_804_cast_fp16)[name = string("V_sliding_slot_1_cast_fp16")];
+            int32 var_811 = const()[name = string("op_811"), val = int32(-1)];
+            fp16 const_1_promoted_to_fp16 = const()[name = string("const_1_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_813_cast_fp16 = mul(x = hidden_states, y = const_1_promoted_to_fp16)[name = string("op_813_cast_fp16")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_5_cast_fp16 = concat(axis = var_811, interleave = input_5_interleave_0, values = (hidden_states, var_813_cast_fp16))[name = string("input_5_cast_fp16")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_808_to_fp16 = const()[name = string("op_808_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_808_to_fp16, x = input_5_cast_fp16)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_818_split_sizes_0 = const()[name = string("op_818_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_818_axis_0 = const()[name = string("op_818_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_818_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_818_cast_fp16_1 = split(axis = var_818_axis_0, split_sizes = var_818_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_818_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545024768)))];
+            tensor<fp16, [1, 1, 2560]> h_1_cast_fp16 = mul(x = var_818_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_824 = const()[name = string("op_824"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_827_axes_0 = const()[name = string("op_827_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_825_cast_fp16 = transpose(perm = var_824, x = h_1_cast_fp16)[name = string("transpose_215")];
+            tensor<fp16, [1, 2560, 1, 1]> var_827_cast_fp16 = expand_dims(axes = var_827_axes_0, x = var_825_cast_fp16)[name = string("op_827_cast_fp16")];
+            string var_843_pad_type_0 = const()[name = string("op_843_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_843_strides_0 = const()[name = string("op_843_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_843_pad_0 = const()[name = string("op_843_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_843_dilations_0 = const()[name = string("op_843_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_843_groups_0 = const()[name = string("op_843_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_843 = conv(dilations = var_843_dilations_0, groups = var_843_groups_0, pad = var_843_pad_0, pad_type = var_843_pad_type_0, strides = var_843_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_827_cast_fp16)[name = string("op_843")];
+            tensor<int32, [4]> var_848 = const()[name = string("op_848"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_849 = reshape(shape = var_848, x = var_843)[name = string("op_849")];
+            tensor<int32, [4]> var_854 = const()[name = string("op_854"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_864 = const()[name = string("op_864"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_855 = transpose(perm = var_854, x = var_849)[name = string("transpose_214")];
+            tensor<fp16, [1, 8, 256]> x_1 = reshape(shape = var_864, x = var_855)[name = string("x_1")];
+            int32 var_870 = const()[name = string("op_870"), val = int32(-1)];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_872 = mul(x = x_1, y = const_2_promoted)[name = string("op_872")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_9 = concat(axis = var_870, interleave = input_9_interleave_0, values = (x_1, var_872))[name = string("input_9")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_867_to_fp16 = const()[name = string("op_867_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_867_to_fp16, x = input_9)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_877_split_sizes_0 = const()[name = string("op_877_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_877_axis_0 = const()[name = string("op_877_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_877_0, tensor<fp16, [1, 8, 256]> var_877_1 = split(axis = var_877_axis_0, split_sizes = var_877_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_877")];
+            tensor<fp16, [1, 8, 256]> var_879 = mul(x = var_877_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_879")];
+            tensor<int32, [4]> var_884 = const()[name = string("op_884"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_3 = reshape(shape = var_884, x = var_879)[name = string("q_3")];
+            tensor<fp16, [1, 8, 1, 256]> var_886_cast_fp16 = mul(x = q_3, y = cos_s)[name = string("op_886_cast_fp16")];
+            tensor<int32, [2]> var_887_split_sizes_0 = const()[name = string("op_887_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_887_axis_0 = const()[name = string("op_887_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_887_0, tensor<fp16, [1, 8, 1, 128]> var_887_1 = split(axis = var_887_axis_0, split_sizes = var_887_split_sizes_0, x = q_3)[name = string("op_887")];
+            fp16 const_3_promoted = const()[name = string("const_3_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_889 = mul(x = var_887_1, y = const_3_promoted)[name = string("op_889")];
+            int32 var_891 = const()[name = string("op_891"), val = int32(-1)];
+            bool var_892_interleave_0 = const()[name = string("op_892_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_892 = concat(axis = var_891, interleave = var_892_interleave_0, values = (var_889, var_887_0))[name = string("op_892")];
+            tensor<fp16, [1, 8, 1, 256]> var_893_cast_fp16 = mul(x = var_892, y = sin_s)[name = string("op_893_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_7_cast_fp16 = add(x = var_886_cast_fp16, y = var_893_cast_fp16)[name = string("q_7_cast_fp16")];
+            string var_906_pad_type_0 = const()[name = string("op_906_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_906_strides_0 = const()[name = string("op_906_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_906_pad_0 = const()[name = string("op_906_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_906_dilations_0 = const()[name = string("op_906_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_906_groups_0 = const()[name = string("op_906_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_906 = conv(dilations = var_906_dilations_0, groups = var_906_groups_0, pad = var_906_pad_0, pad_type = var_906_pad_type_0, strides = var_906_strides_0, weight = layers_0_self_attn_k_proj_weight_palettized, x = var_827_cast_fp16)[name = string("op_906")];
+            tensor<int32, [4]> var_911 = const()[name = string("op_911"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_912 = reshape(shape = var_911, x = var_906)[name = string("op_912")];
+            tensor<int32, [4]> var_917 = const()[name = string("op_917"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_934_pad_type_0 = const()[name = string("op_934_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_934_strides_0 = const()[name = string("op_934_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_934_pad_0 = const()[name = string("op_934_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_934_dilations_0 = const()[name = string("op_934_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_934_groups_0 = const()[name = string("op_934_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_934 = conv(dilations = var_934_dilations_0, groups = var_934_groups_0, pad = var_934_pad_0, pad_type = var_934_pad_type_0, strides = var_934_strides_0, weight = layers_0_self_attn_v_proj_weight_palettized, x = var_827_cast_fp16)[name = string("op_934")];
+            tensor<int32, [4]> var_939 = const()[name = string("op_939"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_940 = reshape(shape = var_939, x = var_934)[name = string("op_940")];
+            tensor<int32, [4]> var_945 = const()[name = string("op_945"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_955 = const()[name = string("op_955"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_918 = transpose(perm = var_917, x = var_912)[name = string("transpose_213")];
+            tensor<fp16, [1, 2, 256]> x_3 = reshape(shape = var_955, x = var_918)[name = string("x_3")];
+            int32 var_961 = const()[name = string("op_961"), val = int32(-1)];
+            fp16 const_4_promoted = const()[name = string("const_4_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_963 = mul(x = x_3, y = const_4_promoted)[name = string("op_963")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_11 = concat(axis = var_961, interleave = input_11_interleave_0, values = (x_3, var_963))[name = string("input_11")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_958_to_fp16 = const()[name = string("op_958_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_958_to_fp16, x = input_11)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_968_split_sizes_0 = const()[name = string("op_968_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_968_axis_0 = const()[name = string("op_968_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_968_0, tensor<fp16, [1, 2, 256]> var_968_1 = split(axis = var_968_axis_0, split_sizes = var_968_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_968")];
+            tensor<fp16, [1, 2, 256]> var_970 = mul(x = var_968_0, y = layers_0_self_attn_k_norm_weight)[name = string("op_970")];
+            tensor<int32, [4]> var_975 = const()[name = string("op_975"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_5 = reshape(shape = var_975, x = var_970)[name = string("q_5")];
+            fp16 var_977_promoted = const()[name = string("op_977_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_946 = transpose(perm = var_945, x = var_940)[name = string("transpose_212")];
+            tensor<fp16, [1, 2, 1, 256]> var_978 = pow(x = var_946, y = var_977_promoted)[name = string("op_978")];
+            tensor<int32, [1]> var_983_axes_0 = const()[name = string("op_983_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_983_keep_dims_0 = const()[name = string("op_983_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_983 = reduce_mean(axes = var_983_axes_0, keep_dims = var_983_keep_dims_0, x = var_978)[name = string("op_983")];
+            fp16 var_985_to_fp16 = const()[name = string("op_985_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_1_cast_fp16 = add(x = var_983, y = var_985_to_fp16)[name = string("mean_sq_1_cast_fp16")];
+            fp32 var_987_epsilon_0 = const()[name = string("op_987_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_987_cast_fp16 = rsqrt(epsilon = var_987_epsilon_0, x = mean_sq_1_cast_fp16)[name = string("op_987_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_15_cast_fp16 = mul(x = var_946, y = var_987_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_989_cast_fp16 = mul(x = q_5, y = cos_s)[name = string("op_989_cast_fp16")];
+            tensor<int32, [2]> var_990_split_sizes_0 = const()[name = string("op_990_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_990_axis_0 = const()[name = string("op_990_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_990_0, tensor<fp16, [1, 2, 1, 128]> var_990_1 = split(axis = var_990_axis_0, split_sizes = var_990_split_sizes_0, x = q_5)[name = string("op_990")];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_992 = mul(x = var_990_1, y = const_5_promoted)[name = string("op_992")];
+            int32 var_994 = const()[name = string("op_994"), val = int32(-1)];
+            bool var_995_interleave_0 = const()[name = string("op_995_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_995 = concat(axis = var_994, interleave = var_995_interleave_0, values = (var_992, var_990_0))[name = string("op_995")];
+            tensor<fp16, [1, 2, 1, 256]> var_996_cast_fp16 = mul(x = var_995, y = sin_s)[name = string("op_996_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_13_cast_fp16 = add(x = var_989_cast_fp16, y = var_996_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<int32, [8]> k_padded_1_pad_0 = const()[name = string("k_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_1_mode_0 = const()[name = string("k_padded_1_mode_0"), val = string("constant")];
+            fp16 const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_1_cast_fp16 = pad(constant_val = const_6_to_fp16, mode = k_padded_1_mode_0, pad = k_padded_1_pad_0, x = input_13_cast_fp16)[name = string("k_padded_1_cast_fp16")];
+            tensor<int32, [8]> v_padded_1_pad_0 = const()[name = string("v_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_1_mode_0 = const()[name = string("v_padded_1_mode_0"), val = string("constant")];
+            fp16 const_7_to_fp16 = const()[name = string("const_7_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_1_cast_fp16 = pad(constant_val = const_7_to_fp16, mode = v_padded_1_mode_0, pad = v_padded_1_pad_0, x = input_15_cast_fp16)[name = string("v_padded_1_cast_fp16")];
+            tensor<int32, [4]> var_1025_begin_0 = const()[name = string("op_1025_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1025_end_0 = const()[name = string("op_1025_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1025_end_mask_0 = const()[name = string("op_1025_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1025_cast_fp16 = slice_by_index(begin = var_1025_begin_0, end = var_1025_end_0, end_mask = var_1025_end_mask_0, x = K_sliding_slot_1_cast_fp16)[name = string("op_1025_cast_fp16")];
+            int32 var_1032 = const()[name = string("op_1032"), val = int32(2)];
+            bool K_sliding_out_1_interleave_0 = const()[name = string("K_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_1_cast_fp16 = concat(axis = var_1032, interleave = K_sliding_out_1_interleave_0, values = (var_1025_cast_fp16, k_padded_1_cast_fp16))[name = string("K_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_1048_begin_0 = const()[name = string("op_1048_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1048_end_0 = const()[name = string("op_1048_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1048_end_mask_0 = const()[name = string("op_1048_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1048_cast_fp16 = slice_by_index(begin = var_1048_begin_0, end = var_1048_end_0, end_mask = var_1048_end_mask_0, x = V_sliding_slot_1_cast_fp16)[name = string("op_1048_cast_fp16")];
+            int32 var_1055 = const()[name = string("op_1055"), val = int32(2)];
+            bool V_sliding_out_1_interleave_0 = const()[name = string("V_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_1_cast_fp16 = concat(axis = var_1055, interleave = V_sliding_out_1_interleave_0, values = (var_1048_cast_fp16, v_padded_1_cast_fp16))[name = string("V_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_1_begin_0 = const()[name = string("K_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_1_end_0 = const()[name = string("K_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_1_end_mask_0 = const()[name = string("K_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_1_cast_fp16 = slice_by_index(begin = K_for_attn_1_begin_0, end = K_for_attn_1_end_0, end_mask = K_for_attn_1_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("K_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_1_begin_0 = const()[name = string("V_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_1_end_0 = const()[name = string("V_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_1_end_mask_0 = const()[name = string("V_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_1_cast_fp16 = slice_by_index(begin = V_for_attn_1_begin_0, end = V_for_attn_1_end_0, end_mask = V_for_attn_1_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("V_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = K_for_attn_1_cast_fp16)[name = string("transpose_211")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_210")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_48_perm_0 = const()[name = string("transpose_48_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = V_for_attn_1_cast_fp16)[name = string("transpose_209")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_208")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_48_cast_fp16 = transpose(perm = transpose_48_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_207")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_7_cast_fp16, y = transpose_48_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_7_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_7_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_7_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 1, 512]> var_1096 = sub(x = x_7_cast_fp16, y = reduce_max_0)[name = string("op_1096")];
+            tensor<fp16, [1, 8, 1, 512]> var_1102 = exp(x = var_1096)[name = string("op_1102")];
+            tensor<int32, [1]> var_1112_axes_0 = const()[name = string("op_1112_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1112_keep_dims_0 = const()[name = string("op_1112_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1112 = reduce_sum(axes = var_1112_axes_0, keep_dims = var_1112_keep_dims_0, x = var_1102)[name = string("op_1112")];
+            tensor<fp16, [1, 8, 1, 512]> var_1118_cast_fp16 = real_div(x = var_1102, y = var_1112)[name = string("op_1118_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_206")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_1118_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_1129 = const()[name = string("op_1129"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1136 = const()[name = string("op_1136"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1130_cast_fp16 = transpose(perm = var_1129, x = attn_output_1_cast_fp16)[name = string("transpose_205")];
+            tensor<fp16, [1, 1, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_1136, x = var_1130_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_1141 = const()[name = string("op_1141"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1157_pad_type_0 = const()[name = string("op_1157_pad_type_0"), val = string("valid")];
+            int32 var_1157_groups_0 = const()[name = string("op_1157_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1157_strides_0 = const()[name = string("op_1157_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1157_pad_0 = const()[name = string("op_1157_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1157_dilations_0 = const()[name = string("op_1157_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545029952))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547651456))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1142_cast_fp16 = transpose(perm = var_1141, x = attn_output_3_cast_fp16)[name = string("transpose_204")];
+            tensor<fp16, [1, 2560, 1]> var_1157_cast_fp16 = conv(dilations = var_1157_dilations_0, groups = var_1157_groups_0, pad = var_1157_pad_0, pad_type = var_1157_pad_type_0, strides = var_1157_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_1142_cast_fp16)[name = string("op_1157_cast_fp16")];
+            tensor<int32, [3]> var_1161 = const()[name = string("op_1161"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1167 = const()[name = string("op_1167"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_11_cast_fp16 = transpose(perm = var_1161, x = var_1157_cast_fp16)[name = string("transpose_203")];
+            tensor<fp16, [1, 1, 2560]> var_1169_cast_fp16 = mul(x = x_11_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_1169_cast_fp16")];
+            bool input_19_interleave_0 = const()[name = string("input_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_19_cast_fp16 = concat(axis = var_1167, interleave = input_19_interleave_0, values = (x_11_cast_fp16, var_1169_cast_fp16))[name = string("input_19_cast_fp16")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1164_to_fp16 = const()[name = string("op_1164_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_1164_to_fp16, x = input_19_cast_fp16)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_1174_split_sizes_0 = const()[name = string("op_1174_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1174_axis_0 = const()[name = string("op_1174_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1174_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1174_cast_fp16_1 = split(axis = var_1174_axis_0, split_sizes = var_1174_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_1174_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547654080)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_5_cast_fp16 = mul(x = var_1174_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_13_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_13_cast_fp16")];
+            int32 var_1183 = const()[name = string("op_1183"), val = int32(-1)];
+            fp16 const_9_promoted_to_fp16 = const()[name = string("const_9_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1185_cast_fp16 = mul(x = x_13_cast_fp16, y = const_9_promoted_to_fp16)[name = string("op_1185_cast_fp16")];
+            bool input_21_interleave_0 = const()[name = string("input_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_21_cast_fp16 = concat(axis = var_1183, interleave = input_21_interleave_0, values = (x_13_cast_fp16, var_1185_cast_fp16))[name = string("input_21_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1180_to_fp16 = const()[name = string("op_1180_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_1180_to_fp16, x = input_21_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_1190_split_sizes_0 = const()[name = string("op_1190_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1190_axis_0 = const()[name = string("op_1190_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1190_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1190_cast_fp16_1 = split(axis = var_1190_axis_0, split_sizes = var_1190_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_1190_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547659264)))];
+            tensor<fp16, [1, 1, 2560]> h_3_cast_fp16 = mul(x = var_1190_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_1201 = const()[name = string("op_1201"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_23_axes_0 = const()[name = string("input_23_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1202 = transpose(perm = var_1201, x = h_3_cast_fp16)[name = string("transpose_202")];
+            tensor<fp16, [1, 2560, 1, 1]> input_23 = expand_dims(axes = input_23_axes_0, x = var_1202)[name = string("input_23")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_23)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_23)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 1]> input_25 = mul(x = gate_3, y = up_1)[name = string("input_25")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_25)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_1242_axes_0 = const()[name = string("op_1242_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1242 = squeeze(axes = var_1242_axes_0, x = mlp_out_1)[name = string("op_1242")];
+            tensor<int32, [3]> var_1246 = const()[name = string("op_1246"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1252 = const()[name = string("op_1252"), val = int32(-1)];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_15 = transpose(perm = var_1246, x = var_1242)[name = string("transpose_201")];
+            tensor<fp16, [1, 1, 2560]> var_1254 = mul(x = x_15, y = const_10_promoted)[name = string("op_1254")];
+            bool input_27_interleave_0 = const()[name = string("input_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_27 = concat(axis = var_1252, interleave = input_27_interleave_0, values = (x_15, var_1254))[name = string("input_27")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1249_to_fp16 = const()[name = string("op_1249_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_1249_to_fp16, x = input_27)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_1259_split_sizes_0 = const()[name = string("op_1259_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1259_axis_0 = const()[name = string("op_1259_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1259_0, tensor<fp16, [1, 1, 2560]> var_1259_1 = split(axis = var_1259_axis_0, split_sizes = var_1259_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_1259")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_3 = mul(x = var_1259_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_5_cast_fp16 = add(x = x_13_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 1, 256])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_1287 = const()[name = string("op_1287"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_29_axes_0 = const()[name = string("input_29_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1288 = transpose(perm = var_1287, x = hidden_states_5_cast_fp16)[name = string("transpose_200")];
+            tensor<fp16, [1, 2560, 1, 1]> input_29 = expand_dims(axes = input_29_axes_0, x = var_1288)[name = string("input_29")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_29)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_1307 = const()[name = string("op_1307"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1308_cast_fp16 = transpose(perm = var_1307, x = per_layer_slice_1_cast_fp16)[name = string("transpose_199")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_1308_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_31_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_31_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547664448))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547992192))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_31_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_1324_axes_0 = const()[name = string("op_1324_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1324_cast_fp16 = squeeze(axes = var_1324_axes_0, x = gated_5_cast_fp16)[name = string("op_1324_cast_fp16")];
+            tensor<int32, [3]> var_1328 = const()[name = string("op_1328"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1334 = const()[name = string("op_1334"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_17_cast_fp16 = transpose(perm = var_1328, x = var_1324_cast_fp16)[name = string("transpose_198")];
+            tensor<fp16, [1, 1, 2560]> var_1336_cast_fp16 = mul(x = x_17_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1336_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_33_cast_fp16 = concat(axis = var_1334, interleave = input_33_interleave_0, values = (x_17_cast_fp16, var_1336_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1331_to_fp16 = const()[name = string("op_1331_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_1331_to_fp16, x = input_33_cast_fp16)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_1341_split_sizes_0 = const()[name = string("op_1341_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1341_axis_0 = const()[name = string("op_1341_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1341_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1341_cast_fp16_1 = split(axis = var_1341_axis_0, split_sizes = var_1341_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_1341_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547994816)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_9_cast_fp16 = mul(x = var_1341_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.f4p-5])];
+            tensor<fp16, [1, 1, 2560]> x_19_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_12_promoted_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> var_1353_axes_0 = const()[name = string("op_1353_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1353_cast_fp16 = squeeze(axes = var_1353_axes_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1353_cast_fp16")];
+            tensor<int32, [1]> var_1355_axes_0 = const()[name = string("op_1355_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1355_cast_fp16 = squeeze(axes = var_1355_axes_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1355_cast_fp16")];
+            tensor<int32, [4]> var_1358_begin_0 = const()[name = string("op_1358_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1358_end_0 = const()[name = string("op_1358_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1358_end_mask_0 = const()[name = string("op_1358_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1358_squeeze_mask_0 = const()[name = string("op_1358_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1358_cast_fp16 = slice_by_index(begin = var_1358_begin_0, end = var_1358_end_0, end_mask = var_1358_end_mask_0, squeeze_mask = var_1358_squeeze_mask_0, x = K_sliding_in)[name = string("op_1358_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_3_axes_0 = const()[name = string("K_sliding_slot_3_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_3_cast_fp16 = expand_dims(axes = K_sliding_slot_3_axes_0, x = var_1358_cast_fp16)[name = string("K_sliding_slot_3_cast_fp16")];
+            tensor<int32, [4]> var_1363_begin_0 = const()[name = string("op_1363_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1363_end_0 = const()[name = string("op_1363_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1363_end_mask_0 = const()[name = string("op_1363_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1363_squeeze_mask_0 = const()[name = string("op_1363_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1363_cast_fp16 = slice_by_index(begin = var_1363_begin_0, end = var_1363_end_0, end_mask = var_1363_end_mask_0, squeeze_mask = var_1363_squeeze_mask_0, x = V_sliding_in)[name = string("op_1363_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_3_axes_0 = const()[name = string("V_sliding_slot_3_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_3_cast_fp16 = expand_dims(axes = V_sliding_slot_3_axes_0, x = var_1363_cast_fp16)[name = string("V_sliding_slot_3_cast_fp16")];
+            int32 var_1370 = const()[name = string("op_1370"), val = int32(-1)];
+            fp16 const_13_promoted_to_fp16 = const()[name = string("const_13_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1372_cast_fp16 = mul(x = x_19_cast_fp16, y = const_13_promoted_to_fp16)[name = string("op_1372_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_35_cast_fp16 = concat(axis = var_1370, interleave = input_35_interleave_0, values = (x_19_cast_fp16, var_1372_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1367_to_fp16 = const()[name = string("op_1367_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1367_to_fp16, x = input_35_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1377_split_sizes_0 = const()[name = string("op_1377_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1377_axis_0 = const()[name = string("op_1377_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1377_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1377_cast_fp16_1 = split(axis = var_1377_axis_0, split_sizes = var_1377_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1377_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(548000000)))];
+            tensor<fp16, [1, 1, 2560]> h_7_cast_fp16 = mul(x = var_1377_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_1383 = const()[name = string("op_1383"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1386_axes_0 = const()[name = string("op_1386_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1384_cast_fp16 = transpose(perm = var_1383, x = h_7_cast_fp16)[name = string("transpose_197")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1386_cast_fp16 = expand_dims(axes = var_1386_axes_0, x = var_1384_cast_fp16)[name = string("op_1386_cast_fp16")];
+            string var_1402_pad_type_0 = const()[name = string("op_1402_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1402_strides_0 = const()[name = string("op_1402_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1402_pad_0 = const()[name = string("op_1402_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1402_dilations_0 = const()[name = string("op_1402_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1402_groups_0 = const()[name = string("op_1402_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1402 = conv(dilations = var_1402_dilations_0, groups = var_1402_groups_0, pad = var_1402_pad_0, pad_type = var_1402_pad_type_0, strides = var_1402_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_1386_cast_fp16)[name = string("op_1402")];
+            tensor<int32, [4]> var_1407 = const()[name = string("op_1407"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1408 = reshape(shape = var_1407, x = var_1402)[name = string("op_1408")];
+            tensor<int32, [4]> var_1413 = const()[name = string("op_1413"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1423 = const()[name = string("op_1423"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1414 = transpose(perm = var_1413, x = var_1408)[name = string("transpose_196")];
+            tensor<fp16, [1, 8, 256]> x_21 = reshape(shape = var_1423, x = var_1414)[name = string("x_21")];
+            int32 var_1429 = const()[name = string("op_1429"), val = int32(-1)];
+            fp16 const_14_promoted = const()[name = string("const_14_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1431 = mul(x = x_21, y = const_14_promoted)[name = string("op_1431")];
+            bool input_39_interleave_0 = const()[name = string("input_39_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_39 = concat(axis = var_1429, interleave = input_39_interleave_0, values = (x_21, var_1431))[name = string("input_39")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1426_to_fp16 = const()[name = string("op_1426_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1426_to_fp16, x = input_39)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1436_split_sizes_0 = const()[name = string("op_1436_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1436_axis_0 = const()[name = string("op_1436_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1436_0, tensor<fp16, [1, 8, 256]> var_1436_1 = split(axis = var_1436_axis_0, split_sizes = var_1436_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1436")];
+            tensor<fp16, [1, 8, 256]> var_1438 = mul(x = var_1436_0, y = layers_1_self_attn_q_norm_weight)[name = string("op_1438")];
+            tensor<int32, [4]> var_1443 = const()[name = string("op_1443"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_11 = reshape(shape = var_1443, x = var_1438)[name = string("q_11")];
+            tensor<fp16, [1, 8, 1, 256]> var_1445_cast_fp16 = mul(x = q_11, y = cos_s)[name = string("op_1445_cast_fp16")];
+            tensor<int32, [2]> var_1446_split_sizes_0 = const()[name = string("op_1446_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1446_axis_0 = const()[name = string("op_1446_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1446_0, tensor<fp16, [1, 8, 1, 128]> var_1446_1 = split(axis = var_1446_axis_0, split_sizes = var_1446_split_sizes_0, x = q_11)[name = string("op_1446")];
+            fp16 const_15_promoted = const()[name = string("const_15_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1448 = mul(x = var_1446_1, y = const_15_promoted)[name = string("op_1448")];
+            int32 var_1450 = const()[name = string("op_1450"), val = int32(-1)];
+            bool var_1451_interleave_0 = const()[name = string("op_1451_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1451 = concat(axis = var_1450, interleave = var_1451_interleave_0, values = (var_1448, var_1446_0))[name = string("op_1451")];
+            tensor<fp16, [1, 8, 1, 256]> var_1452_cast_fp16 = mul(x = var_1451, y = sin_s)[name = string("op_1452_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_15_cast_fp16 = add(x = var_1445_cast_fp16, y = var_1452_cast_fp16)[name = string("q_15_cast_fp16")];
+            string var_1465_pad_type_0 = const()[name = string("op_1465_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1465_strides_0 = const()[name = string("op_1465_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1465_pad_0 = const()[name = string("op_1465_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1465_dilations_0 = const()[name = string("op_1465_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1465_groups_0 = const()[name = string("op_1465_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1465 = conv(dilations = var_1465_dilations_0, groups = var_1465_groups_0, pad = var_1465_pad_0, pad_type = var_1465_pad_type_0, strides = var_1465_strides_0, weight = layers_1_self_attn_k_proj_weight_palettized, x = var_1386_cast_fp16)[name = string("op_1465")];
+            tensor<int32, [4]> var_1470 = const()[name = string("op_1470"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1471 = reshape(shape = var_1470, x = var_1465)[name = string("op_1471")];
+            tensor<int32, [4]> var_1476 = const()[name = string("op_1476"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_1493_pad_type_0 = const()[name = string("op_1493_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1493_strides_0 = const()[name = string("op_1493_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1493_pad_0 = const()[name = string("op_1493_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1493_dilations_0 = const()[name = string("op_1493_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1493_groups_0 = const()[name = string("op_1493_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1493 = conv(dilations = var_1493_dilations_0, groups = var_1493_groups_0, pad = var_1493_pad_0, pad_type = var_1493_pad_type_0, strides = var_1493_strides_0, weight = layers_1_self_attn_v_proj_weight_palettized, x = var_1386_cast_fp16)[name = string("op_1493")];
+            tensor<int32, [4]> var_1498 = const()[name = string("op_1498"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1499 = reshape(shape = var_1498, x = var_1493)[name = string("op_1499")];
+            tensor<int32, [4]> var_1504 = const()[name = string("op_1504"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1514 = const()[name = string("op_1514"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_1477 = transpose(perm = var_1476, x = var_1471)[name = string("transpose_195")];
+            tensor<fp16, [1, 2, 256]> x_23 = reshape(shape = var_1514, x = var_1477)[name = string("x_23")];
+            int32 var_1520 = const()[name = string("op_1520"), val = int32(-1)];
+            fp16 const_16_promoted = const()[name = string("const_16_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_1522 = mul(x = x_23, y = const_16_promoted)[name = string("op_1522")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_41 = concat(axis = var_1520, interleave = input_41_interleave_0, values = (x_23, var_1522))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1517_to_fp16 = const()[name = string("op_1517_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1517_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1527_split_sizes_0 = const()[name = string("op_1527_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1527_axis_0 = const()[name = string("op_1527_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_1527_0, tensor<fp16, [1, 2, 256]> var_1527_1 = split(axis = var_1527_axis_0, split_sizes = var_1527_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1527")];
+            tensor<fp16, [1, 2, 256]> var_1529 = mul(x = var_1527_0, y = layers_1_self_attn_k_norm_weight)[name = string("op_1529")];
+            tensor<int32, [4]> var_1534 = const()[name = string("op_1534"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_13 = reshape(shape = var_1534, x = var_1529)[name = string("q_13")];
+            fp16 var_1536_promoted = const()[name = string("op_1536_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_1505 = transpose(perm = var_1504, x = var_1499)[name = string("transpose_194")];
+            tensor<fp16, [1, 2, 1, 256]> var_1537 = pow(x = var_1505, y = var_1536_promoted)[name = string("op_1537")];
+            tensor<int32, [1]> var_1542_axes_0 = const()[name = string("op_1542_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1542_keep_dims_0 = const()[name = string("op_1542_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_1542 = reduce_mean(axes = var_1542_axes_0, keep_dims = var_1542_keep_dims_0, x = var_1537)[name = string("op_1542")];
+            fp16 var_1544_to_fp16 = const()[name = string("op_1544_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_3_cast_fp16 = add(x = var_1542, y = var_1544_to_fp16)[name = string("mean_sq_3_cast_fp16")];
+            fp32 var_1546_epsilon_0 = const()[name = string("op_1546_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_1546_cast_fp16 = rsqrt(epsilon = var_1546_epsilon_0, x = mean_sq_3_cast_fp16)[name = string("op_1546_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_45_cast_fp16 = mul(x = var_1505, y = var_1546_cast_fp16)[name = string("input_45_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_1548_cast_fp16 = mul(x = q_13, y = cos_s)[name = string("op_1548_cast_fp16")];
+            tensor<int32, [2]> var_1549_split_sizes_0 = const()[name = string("op_1549_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1549_axis_0 = const()[name = string("op_1549_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_1549_0, tensor<fp16, [1, 2, 1, 128]> var_1549_1 = split(axis = var_1549_axis_0, split_sizes = var_1549_split_sizes_0, x = q_13)[name = string("op_1549")];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_1551 = mul(x = var_1549_1, y = const_17_promoted)[name = string("op_1551")];
+            int32 var_1553 = const()[name = string("op_1553"), val = int32(-1)];
+            bool var_1554_interleave_0 = const()[name = string("op_1554_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_1554 = concat(axis = var_1553, interleave = var_1554_interleave_0, values = (var_1551, var_1549_0))[name = string("op_1554")];
+            tensor<fp16, [1, 2, 1, 256]> var_1555_cast_fp16 = mul(x = var_1554, y = sin_s)[name = string("op_1555_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_43_cast_fp16 = add(x = var_1548_cast_fp16, y = var_1555_cast_fp16)[name = string("input_43_cast_fp16")];
+            tensor<int32, [8]> k_padded_3_pad_0 = const()[name = string("k_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_3_mode_0 = const()[name = string("k_padded_3_mode_0"), val = string("constant")];
+            fp16 const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_3_cast_fp16 = pad(constant_val = const_18_to_fp16, mode = k_padded_3_mode_0, pad = k_padded_3_pad_0, x = input_43_cast_fp16)[name = string("k_padded_3_cast_fp16")];
+            tensor<int32, [8]> v_padded_3_pad_0 = const()[name = string("v_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_3_mode_0 = const()[name = string("v_padded_3_mode_0"), val = string("constant")];
+            fp16 const_19_to_fp16 = const()[name = string("const_19_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_3_cast_fp16 = pad(constant_val = const_19_to_fp16, mode = v_padded_3_mode_0, pad = v_padded_3_pad_0, x = input_45_cast_fp16)[name = string("v_padded_3_cast_fp16")];
+            tensor<int32, [4]> var_1584_begin_0 = const()[name = string("op_1584_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1584_end_0 = const()[name = string("op_1584_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1584_end_mask_0 = const()[name = string("op_1584_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1584_cast_fp16 = slice_by_index(begin = var_1584_begin_0, end = var_1584_end_0, end_mask = var_1584_end_mask_0, x = K_sliding_slot_3_cast_fp16)[name = string("op_1584_cast_fp16")];
+            int32 var_1591 = const()[name = string("op_1591"), val = int32(2)];
+            bool K_sliding_out_3_interleave_0 = const()[name = string("K_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_3_cast_fp16 = concat(axis = var_1591, interleave = K_sliding_out_3_interleave_0, values = (var_1584_cast_fp16, k_padded_3_cast_fp16))[name = string("K_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1607_begin_0 = const()[name = string("op_1607_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1607_end_0 = const()[name = string("op_1607_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1607_end_mask_0 = const()[name = string("op_1607_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1607_cast_fp16 = slice_by_index(begin = var_1607_begin_0, end = var_1607_end_0, end_mask = var_1607_end_mask_0, x = V_sliding_slot_3_cast_fp16)[name = string("op_1607_cast_fp16")];
+            int32 var_1614 = const()[name = string("op_1614"), val = int32(2)];
+            bool V_sliding_out_3_interleave_0 = const()[name = string("V_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_3_cast_fp16 = concat(axis = var_1614, interleave = V_sliding_out_3_interleave_0, values = (var_1607_cast_fp16, v_padded_3_cast_fp16))[name = string("V_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_3_begin_0 = const()[name = string("K_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_3_end_0 = const()[name = string("K_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_3_end_mask_0 = const()[name = string("K_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_3_cast_fp16 = slice_by_index(begin = K_for_attn_3_begin_0, end = K_for_attn_3_end_0, end_mask = K_for_attn_3_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("K_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_3_begin_0 = const()[name = string("V_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_3_end_0 = const()[name = string("V_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_3_end_mask_0 = const()[name = string("V_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_3_cast_fp16 = slice_by_index(begin = V_for_attn_3_begin_0, end = V_for_attn_3_end_0, end_mask = V_for_attn_3_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("V_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> transpose_4_perm_0 = const()[name = string("transpose_4_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_2_reps_0 = const()[name = string("tile_2_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_4_cast_fp16 = transpose(perm = transpose_4_perm_0, x = K_for_attn_3_cast_fp16)[name = string("transpose_193")];
+            tensor<fp16, [8, 1, 512, 256]> tile_2_cast_fp16 = tile(reps = tile_2_reps_0, x = transpose_4_cast_fp16)[name = string("tile_2_cast_fp16")];
+            tensor<int32, [5]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_4_cast_fp16 = reshape(shape = concat_4, x = tile_2_cast_fp16)[name = string("reshape_4_cast_fp16")];
+            tensor<int32, [5]> transpose_5_perm_0 = const()[name = string("transpose_5_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_5 = const()[name = string("concat_5"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_5_cast_fp16 = transpose(perm = transpose_5_perm_0, x = reshape_4_cast_fp16)[name = string("transpose_192")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_5_cast_fp16 = reshape(shape = concat_5, x = transpose_5_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [4]> transpose_49_perm_0 = const()[name = string("transpose_49_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_6_perm_0 = const()[name = string("transpose_6_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_3_reps_0 = const()[name = string("tile_3_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_6_cast_fp16 = transpose(perm = transpose_6_perm_0, x = V_for_attn_3_cast_fp16)[name = string("transpose_191")];
+            tensor<fp16, [8, 1, 512, 256]> tile_3_cast_fp16 = tile(reps = tile_3_reps_0, x = transpose_6_cast_fp16)[name = string("tile_3_cast_fp16")];
+            tensor<int32, [5]> concat_6 = const()[name = string("concat_6"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_6_cast_fp16 = reshape(shape = concat_6, x = tile_3_cast_fp16)[name = string("reshape_6_cast_fp16")];
+            tensor<int32, [5]> transpose_7_perm_0 = const()[name = string("transpose_7_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_7 = const()[name = string("concat_7"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_7_cast_fp16 = transpose(perm = transpose_7_perm_0, x = reshape_6_cast_fp16)[name = string("transpose_190")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_7_cast_fp16 = reshape(shape = concat_7, x = transpose_7_cast_fp16)[name = string("reshape_7_cast_fp16")];
+            tensor<int32, [4]> V_expanded_3_perm_0 = const()[name = string("V_expanded_3_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_49_cast_fp16 = transpose(perm = transpose_49_perm_0, x = reshape_5_cast_fp16)[name = string("transpose_189")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_15_cast_fp16, y = transpose_49_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_27_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_27_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_27_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 1, 512]> var_1655 = sub(x = x_27_cast_fp16, y = reduce_max_1)[name = string("op_1655")];
+            tensor<fp16, [1, 8, 1, 512]> var_1661 = exp(x = var_1655)[name = string("op_1661")];
+            tensor<int32, [1]> var_1671_axes_0 = const()[name = string("op_1671_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1671_keep_dims_0 = const()[name = string("op_1671_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1671 = reduce_sum(axes = var_1671_axes_0, keep_dims = var_1671_keep_dims_0, x = var_1661)[name = string("op_1671")];
+            tensor<fp16, [1, 8, 1, 512]> var_1677_cast_fp16 = real_div(x = var_1661, y = var_1671)[name = string("op_1677_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_3_cast_fp16 = transpose(perm = V_expanded_3_perm_0, x = reshape_7_cast_fp16)[name = string("transpose_188")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_1677_cast_fp16, y = V_expanded_3_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_1688 = const()[name = string("op_1688"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1695 = const()[name = string("op_1695"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1689_cast_fp16 = transpose(perm = var_1688, x = attn_output_7_cast_fp16)[name = string("transpose_187")];
+            tensor<fp16, [1, 1, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_1695, x = var_1689_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_1700 = const()[name = string("op_1700"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1716_pad_type_0 = const()[name = string("op_1716_pad_type_0"), val = string("valid")];
+            int32 var_1716_groups_0 = const()[name = string("op_1716_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1716_strides_0 = const()[name = string("op_1716_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1716_pad_0 = const()[name = string("op_1716_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1716_dilations_0 = const()[name = string("op_1716_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(548005184))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550626688))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1701_cast_fp16 = transpose(perm = var_1700, x = attn_output_9_cast_fp16)[name = string("transpose_186")];
+            tensor<fp16, [1, 2560, 1]> var_1716_cast_fp16 = conv(dilations = var_1716_dilations_0, groups = var_1716_groups_0, pad = var_1716_pad_0, pad_type = var_1716_pad_type_0, strides = var_1716_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_1701_cast_fp16)[name = string("op_1716_cast_fp16")];
+            tensor<int32, [3]> var_1720 = const()[name = string("op_1720"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1726 = const()[name = string("op_1726"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_31_cast_fp16 = transpose(perm = var_1720, x = var_1716_cast_fp16)[name = string("transpose_185")];
+            tensor<fp16, [1, 1, 2560]> var_1728_cast_fp16 = mul(x = x_31_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1728_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_49_cast_fp16 = concat(axis = var_1726, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1728_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1723_to_fp16 = const()[name = string("op_1723_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1723_to_fp16, x = input_49_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1733_split_sizes_0 = const()[name = string("op_1733_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1733_axis_0 = const()[name = string("op_1733_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1733_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1733_cast_fp16_1 = split(axis = var_1733_axis_0, split_sizes = var_1733_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1733_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550629312)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_11_cast_fp16 = mul(x = var_1733_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_33_cast_fp16 = add(x = x_19_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_33_cast_fp16")];
+            int32 var_1742 = const()[name = string("op_1742"), val = int32(-1)];
+            fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1744_cast_fp16 = mul(x = x_33_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_1744_cast_fp16")];
+            bool input_51_interleave_0 = const()[name = string("input_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_51_cast_fp16 = concat(axis = var_1742, interleave = input_51_interleave_0, values = (x_33_cast_fp16, var_1744_cast_fp16))[name = string("input_51_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1739_to_fp16 = const()[name = string("op_1739_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1739_to_fp16, x = input_51_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1749_split_sizes_0 = const()[name = string("op_1749_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1749_axis_0 = const()[name = string("op_1749_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1749_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1749_cast_fp16_1 = split(axis = var_1749_axis_0, split_sizes = var_1749_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1749_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550634496)))];
+            tensor<fp16, [1, 1, 2560]> h_9_cast_fp16 = mul(x = var_1749_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1760 = const()[name = string("op_1760"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_53_axes_0 = const()[name = string("input_53_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1761 = transpose(perm = var_1760, x = h_9_cast_fp16)[name = string("transpose_184")];
+            tensor<fp16, [1, 2560, 1, 1]> input_53 = expand_dims(axes = input_53_axes_0, x = var_1761)[name = string("input_53")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_53)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_53)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 1]> input_55 = mul(x = gate_7, y = up_3)[name = string("input_55")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_55)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1801_axes_0 = const()[name = string("op_1801_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1801 = squeeze(axes = var_1801_axes_0, x = mlp_out_3)[name = string("op_1801")];
+            tensor<int32, [3]> var_1805 = const()[name = string("op_1805"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1811 = const()[name = string("op_1811"), val = int32(-1)];
+            fp16 const_22_promoted = const()[name = string("const_22_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_35 = transpose(perm = var_1805, x = var_1801)[name = string("transpose_183")];
+            tensor<fp16, [1, 1, 2560]> var_1813 = mul(x = x_35, y = const_22_promoted)[name = string("op_1813")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_57 = concat(axis = var_1811, interleave = input_57_interleave_0, values = (x_35, var_1813))[name = string("input_57")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1808_to_fp16 = const()[name = string("op_1808_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1808_to_fp16, x = input_57)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1818_split_sizes_0 = const()[name = string("op_1818_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1818_axis_0 = const()[name = string("op_1818_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1818_0, tensor<fp16, [1, 1, 2560]> var_1818_1 = split(axis = var_1818_axis_0, split_sizes = var_1818_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1818")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_13 = mul(x = var_1818_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_15_cast_fp16 = add(x = x_33_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 256])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 1, 512])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1846 = const()[name = string("op_1846"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_59_axes_0 = const()[name = string("input_59_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1847 = transpose(perm = var_1846, x = hidden_states_15_cast_fp16)[name = string("transpose_182")];
+            tensor<fp16, [1, 2560, 1, 1]> input_59 = expand_dims(axes = input_59_axes_0, x = var_1847)[name = string("input_59")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_59)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1866 = const()[name = string("op_1866"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1867_cast_fp16 = transpose(perm = var_1866, x = per_layer_slice_3_cast_fp16)[name = string("transpose_181")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1867_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_61_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_61_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550639680))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550967424))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_61_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1883_axes_0 = const()[name = string("op_1883_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1883_cast_fp16 = squeeze(axes = var_1883_axes_0, x = gated_11_cast_fp16)[name = string("op_1883_cast_fp16")];
+            tensor<int32, [3]> var_1887 = const()[name = string("op_1887"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1893 = const()[name = string("op_1893"), val = int32(-1)];
+            fp16 const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_37_cast_fp16 = transpose(perm = var_1887, x = var_1883_cast_fp16)[name = string("transpose_180")];
+            tensor<fp16, [1, 1, 2560]> var_1895_cast_fp16 = mul(x = x_37_cast_fp16, y = const_23_promoted_to_fp16)[name = string("op_1895_cast_fp16")];
+            bool input_63_interleave_0 = const()[name = string("input_63_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_63_cast_fp16 = concat(axis = var_1893, interleave = input_63_interleave_0, values = (x_37_cast_fp16, var_1895_cast_fp16))[name = string("input_63_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1890_to_fp16 = const()[name = string("op_1890_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1890_to_fp16, x = input_63_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1900_split_sizes_0 = const()[name = string("op_1900_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1900_axis_0 = const()[name = string("op_1900_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1900_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1900_cast_fp16_1 = split(axis = var_1900_axis_0, split_sizes = var_1900_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1900_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550970048)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1900_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.48p-3])];
+            tensor<fp16, [1, 1, 2560]> x_39_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_24_promoted_to_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_1912_axes_0 = const()[name = string("op_1912_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1912_cast_fp16 = squeeze(axes = var_1912_axes_0, x = K_sliding_out_3_cast_fp16)[name = string("op_1912_cast_fp16")];
+            tensor<int32, [1]> var_1914_axes_0 = const()[name = string("op_1914_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1914_cast_fp16 = squeeze(axes = var_1914_axes_0, x = V_sliding_out_3_cast_fp16)[name = string("op_1914_cast_fp16")];
+            tensor<int32, [4]> var_1917_begin_0 = const()[name = string("op_1917_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1917_end_0 = const()[name = string("op_1917_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_1917_end_mask_0 = const()[name = string("op_1917_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1917_squeeze_mask_0 = const()[name = string("op_1917_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1917_cast_fp16 = slice_by_index(begin = var_1917_begin_0, end = var_1917_end_0, end_mask = var_1917_end_mask_0, squeeze_mask = var_1917_squeeze_mask_0, x = K_sliding_in)[name = string("op_1917_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_5_axes_0 = const()[name = string("K_sliding_slot_5_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_5_cast_fp16 = expand_dims(axes = K_sliding_slot_5_axes_0, x = var_1917_cast_fp16)[name = string("K_sliding_slot_5_cast_fp16")];
+            tensor<int32, [4]> var_1922_begin_0 = const()[name = string("op_1922_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1922_end_0 = const()[name = string("op_1922_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_1922_end_mask_0 = const()[name = string("op_1922_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1922_squeeze_mask_0 = const()[name = string("op_1922_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1922_cast_fp16 = slice_by_index(begin = var_1922_begin_0, end = var_1922_end_0, end_mask = var_1922_end_mask_0, squeeze_mask = var_1922_squeeze_mask_0, x = V_sliding_in)[name = string("op_1922_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_5_axes_0 = const()[name = string("V_sliding_slot_5_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_5_cast_fp16 = expand_dims(axes = V_sliding_slot_5_axes_0, x = var_1922_cast_fp16)[name = string("V_sliding_slot_5_cast_fp16")];
+            int32 var_1929 = const()[name = string("op_1929"), val = int32(-1)];
+            fp16 const_25_promoted_to_fp16 = const()[name = string("const_25_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1931_cast_fp16 = mul(x = x_39_cast_fp16, y = const_25_promoted_to_fp16)[name = string("op_1931_cast_fp16")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_65_cast_fp16 = concat(axis = var_1929, interleave = input_65_interleave_0, values = (x_39_cast_fp16, var_1931_cast_fp16))[name = string("input_65_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1926_to_fp16 = const()[name = string("op_1926_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1926_to_fp16, x = input_65_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1936_split_sizes_0 = const()[name = string("op_1936_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1936_axis_0 = const()[name = string("op_1936_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1936_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1936_cast_fp16_1 = split(axis = var_1936_axis_0, split_sizes = var_1936_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1936_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550975232)))];
+            tensor<fp16, [1, 1, 2560]> h_13_cast_fp16 = mul(x = var_1936_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1942 = const()[name = string("op_1942"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1945_axes_0 = const()[name = string("op_1945_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1943_cast_fp16 = transpose(perm = var_1942, x = h_13_cast_fp16)[name = string("transpose_179")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1945_cast_fp16 = expand_dims(axes = var_1945_axes_0, x = var_1943_cast_fp16)[name = string("op_1945_cast_fp16")];
+            string var_1961_pad_type_0 = const()[name = string("op_1961_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1961_strides_0 = const()[name = string("op_1961_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1961_pad_0 = const()[name = string("op_1961_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1961_dilations_0 = const()[name = string("op_1961_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1961_groups_0 = const()[name = string("op_1961_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1961 = conv(dilations = var_1961_dilations_0, groups = var_1961_groups_0, pad = var_1961_pad_0, pad_type = var_1961_pad_type_0, strides = var_1961_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1945_cast_fp16)[name = string("op_1961")];
+            tensor<int32, [4]> var_1966 = const()[name = string("op_1966"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1967 = reshape(shape = var_1966, x = var_1961)[name = string("op_1967")];
+            tensor<int32, [4]> var_1972 = const()[name = string("op_1972"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1982 = const()[name = string("op_1982"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1973 = transpose(perm = var_1972, x = var_1967)[name = string("transpose_178")];
+            tensor<fp16, [1, 8, 256]> x_41 = reshape(shape = var_1982, x = var_1973)[name = string("x_41")];
+            int32 var_1988 = const()[name = string("op_1988"), val = int32(-1)];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1990 = mul(x = x_41, y = const_26_promoted)[name = string("op_1990")];
+            bool input_69_interleave_0 = const()[name = string("input_69_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_69 = concat(axis = var_1988, interleave = input_69_interleave_0, values = (x_41, var_1990))[name = string("input_69")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1985_to_fp16 = const()[name = string("op_1985_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_1985_to_fp16, x = input_69)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_1995_split_sizes_0 = const()[name = string("op_1995_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1995_axis_0 = const()[name = string("op_1995_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1995_0, tensor<fp16, [1, 8, 256]> var_1995_1 = split(axis = var_1995_axis_0, split_sizes = var_1995_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_1995")];
+            tensor<fp16, [1, 8, 256]> var_1997 = mul(x = var_1995_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_1997")];
+            tensor<int32, [4]> var_2002 = const()[name = string("op_2002"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_19 = reshape(shape = var_2002, x = var_1997)[name = string("q_19")];
+            tensor<fp16, [1, 8, 1, 256]> var_2004_cast_fp16 = mul(x = q_19, y = cos_s)[name = string("op_2004_cast_fp16")];
+            tensor<int32, [2]> var_2005_split_sizes_0 = const()[name = string("op_2005_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2005_axis_0 = const()[name = string("op_2005_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2005_0, tensor<fp16, [1, 8, 1, 128]> var_2005_1 = split(axis = var_2005_axis_0, split_sizes = var_2005_split_sizes_0, x = q_19)[name = string("op_2005")];
+            fp16 const_27_promoted = const()[name = string("const_27_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2007 = mul(x = var_2005_1, y = const_27_promoted)[name = string("op_2007")];
+            int32 var_2009 = const()[name = string("op_2009"), val = int32(-1)];
+            bool var_2010_interleave_0 = const()[name = string("op_2010_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2010 = concat(axis = var_2009, interleave = var_2010_interleave_0, values = (var_2007, var_2005_0))[name = string("op_2010")];
+            tensor<fp16, [1, 8, 1, 256]> var_2011_cast_fp16 = mul(x = var_2010, y = sin_s)[name = string("op_2011_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_23_cast_fp16 = add(x = var_2004_cast_fp16, y = var_2011_cast_fp16)[name = string("q_23_cast_fp16")];
+            string var_2024_pad_type_0 = const()[name = string("op_2024_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2024_strides_0 = const()[name = string("op_2024_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2024_pad_0 = const()[name = string("op_2024_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2024_dilations_0 = const()[name = string("op_2024_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2024_groups_0 = const()[name = string("op_2024_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2024 = conv(dilations = var_2024_dilations_0, groups = var_2024_groups_0, pad = var_2024_pad_0, pad_type = var_2024_pad_type_0, strides = var_2024_strides_0, weight = layers_2_self_attn_k_proj_weight_palettized, x = var_1945_cast_fp16)[name = string("op_2024")];
+            tensor<int32, [4]> var_2029 = const()[name = string("op_2029"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2030 = reshape(shape = var_2029, x = var_2024)[name = string("op_2030")];
+            tensor<int32, [4]> var_2035 = const()[name = string("op_2035"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_2052_pad_type_0 = const()[name = string("op_2052_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2052_strides_0 = const()[name = string("op_2052_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2052_pad_0 = const()[name = string("op_2052_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2052_dilations_0 = const()[name = string("op_2052_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2052_groups_0 = const()[name = string("op_2052_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2052 = conv(dilations = var_2052_dilations_0, groups = var_2052_groups_0, pad = var_2052_pad_0, pad_type = var_2052_pad_type_0, strides = var_2052_strides_0, weight = layers_2_self_attn_v_proj_weight_palettized, x = var_1945_cast_fp16)[name = string("op_2052")];
+            tensor<int32, [4]> var_2057 = const()[name = string("op_2057"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2058 = reshape(shape = var_2057, x = var_2052)[name = string("op_2058")];
+            tensor<int32, [4]> var_2063 = const()[name = string("op_2063"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2073 = const()[name = string("op_2073"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_2036 = transpose(perm = var_2035, x = var_2030)[name = string("transpose_177")];
+            tensor<fp16, [1, 2, 256]> x_43 = reshape(shape = var_2073, x = var_2036)[name = string("x_43")];
+            int32 var_2079 = const()[name = string("op_2079"), val = int32(-1)];
+            fp16 const_28_promoted = const()[name = string("const_28_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_2081 = mul(x = x_43, y = const_28_promoted)[name = string("op_2081")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_71 = concat(axis = var_2079, interleave = input_71_interleave_0, values = (x_43, var_2081))[name = string("input_71")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2076_to_fp16 = const()[name = string("op_2076_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_2076_to_fp16, x = input_71)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_2086_split_sizes_0 = const()[name = string("op_2086_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2086_axis_0 = const()[name = string("op_2086_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_2086_0, tensor<fp16, [1, 2, 256]> var_2086_1 = split(axis = var_2086_axis_0, split_sizes = var_2086_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_2086")];
+            tensor<fp16, [1, 2, 256]> var_2088 = mul(x = var_2086_0, y = layers_2_self_attn_k_norm_weight)[name = string("op_2088")];
+            tensor<int32, [4]> var_2093 = const()[name = string("op_2093"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_21 = reshape(shape = var_2093, x = var_2088)[name = string("q_21")];
+            fp16 var_2095_promoted = const()[name = string("op_2095_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_2064 = transpose(perm = var_2063, x = var_2058)[name = string("transpose_176")];
+            tensor<fp16, [1, 2, 1, 256]> var_2096 = pow(x = var_2064, y = var_2095_promoted)[name = string("op_2096")];
+            tensor<int32, [1]> var_2101_axes_0 = const()[name = string("op_2101_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2101_keep_dims_0 = const()[name = string("op_2101_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_2101 = reduce_mean(axes = var_2101_axes_0, keep_dims = var_2101_keep_dims_0, x = var_2096)[name = string("op_2101")];
+            fp16 var_2103_to_fp16 = const()[name = string("op_2103_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_5_cast_fp16 = add(x = var_2101, y = var_2103_to_fp16)[name = string("mean_sq_5_cast_fp16")];
+            fp32 var_2105_epsilon_0 = const()[name = string("op_2105_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_2105_cast_fp16 = rsqrt(epsilon = var_2105_epsilon_0, x = mean_sq_5_cast_fp16)[name = string("op_2105_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_75_cast_fp16 = mul(x = var_2064, y = var_2105_cast_fp16)[name = string("input_75_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_2107_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_2107_cast_fp16")];
+            tensor<int32, [2]> var_2108_split_sizes_0 = const()[name = string("op_2108_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2108_axis_0 = const()[name = string("op_2108_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_2108_0, tensor<fp16, [1, 2, 1, 128]> var_2108_1 = split(axis = var_2108_axis_0, split_sizes = var_2108_split_sizes_0, x = q_21)[name = string("op_2108")];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_2110 = mul(x = var_2108_1, y = const_29_promoted)[name = string("op_2110")];
+            int32 var_2112 = const()[name = string("op_2112"), val = int32(-1)];
+            bool var_2113_interleave_0 = const()[name = string("op_2113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_2113 = concat(axis = var_2112, interleave = var_2113_interleave_0, values = (var_2110, var_2108_0))[name = string("op_2113")];
+            tensor<fp16, [1, 2, 1, 256]> var_2114_cast_fp16 = mul(x = var_2113, y = sin_s)[name = string("op_2114_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_73_cast_fp16 = add(x = var_2107_cast_fp16, y = var_2114_cast_fp16)[name = string("input_73_cast_fp16")];
+            tensor<int32, [8]> k_padded_5_pad_0 = const()[name = string("k_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_5_mode_0 = const()[name = string("k_padded_5_mode_0"), val = string("constant")];
+            fp16 const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_5_cast_fp16 = pad(constant_val = const_30_to_fp16, mode = k_padded_5_mode_0, pad = k_padded_5_pad_0, x = input_73_cast_fp16)[name = string("k_padded_5_cast_fp16")];
+            tensor<int32, [8]> v_padded_5_pad_0 = const()[name = string("v_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_5_mode_0 = const()[name = string("v_padded_5_mode_0"), val = string("constant")];
+            fp16 const_31_to_fp16 = const()[name = string("const_31_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_5_cast_fp16 = pad(constant_val = const_31_to_fp16, mode = v_padded_5_mode_0, pad = v_padded_5_pad_0, x = input_75_cast_fp16)[name = string("v_padded_5_cast_fp16")];
+            tensor<int32, [4]> var_2143_begin_0 = const()[name = string("op_2143_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2143_end_0 = const()[name = string("op_2143_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2143_end_mask_0 = const()[name = string("op_2143_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2143_cast_fp16 = slice_by_index(begin = var_2143_begin_0, end = var_2143_end_0, end_mask = var_2143_end_mask_0, x = K_sliding_slot_5_cast_fp16)[name = string("op_2143_cast_fp16")];
+            int32 var_2150 = const()[name = string("op_2150"), val = int32(2)];
+            bool K_sliding_out_5_interleave_0 = const()[name = string("K_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_5_cast_fp16 = concat(axis = var_2150, interleave = K_sliding_out_5_interleave_0, values = (var_2143_cast_fp16, k_padded_5_cast_fp16))[name = string("K_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2166_begin_0 = const()[name = string("op_2166_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2166_end_0 = const()[name = string("op_2166_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2166_end_mask_0 = const()[name = string("op_2166_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2166_cast_fp16 = slice_by_index(begin = var_2166_begin_0, end = var_2166_end_0, end_mask = var_2166_end_mask_0, x = V_sliding_slot_5_cast_fp16)[name = string("op_2166_cast_fp16")];
+            int32 var_2173 = const()[name = string("op_2173"), val = int32(2)];
+            bool V_sliding_out_5_interleave_0 = const()[name = string("V_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_5_cast_fp16 = concat(axis = var_2173, interleave = V_sliding_out_5_interleave_0, values = (var_2166_cast_fp16, v_padded_5_cast_fp16))[name = string("V_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_5_begin_0 = const()[name = string("K_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_5_end_0 = const()[name = string("K_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_5_end_mask_0 = const()[name = string("K_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_5_cast_fp16 = slice_by_index(begin = K_for_attn_5_begin_0, end = K_for_attn_5_end_0, end_mask = K_for_attn_5_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("K_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_5_begin_0 = const()[name = string("V_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_5_end_0 = const()[name = string("V_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_5_end_mask_0 = const()[name = string("V_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_5_cast_fp16 = slice_by_index(begin = V_for_attn_5_begin_0, end = V_for_attn_5_end_0, end_mask = V_for_attn_5_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("V_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = K_for_attn_5_cast_fp16)[name = string("transpose_175")];
+            tensor<fp16, [8, 1, 512, 256]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_174")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_50_perm_0 = const()[name = string("transpose_50_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = V_for_attn_5_cast_fp16)[name = string("transpose_173")];
+            tensor<fp16, [8, 1, 512, 256]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_172")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_50_cast_fp16 = transpose(perm = transpose_50_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_171")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_23_cast_fp16, y = transpose_50_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_47_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_47_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_47_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 1, 512]> var_2214 = sub(x = x_47_cast_fp16, y = reduce_max_2)[name = string("op_2214")];
+            tensor<fp16, [1, 8, 1, 512]> var_2220 = exp(x = var_2214)[name = string("op_2220")];
+            tensor<int32, [1]> var_2230_axes_0 = const()[name = string("op_2230_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2230_keep_dims_0 = const()[name = string("op_2230_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2230 = reduce_sum(axes = var_2230_axes_0, keep_dims = var_2230_keep_dims_0, x = var_2220)[name = string("op_2230")];
+            tensor<fp16, [1, 8, 1, 512]> var_2236_cast_fp16 = real_div(x = var_2220, y = var_2230)[name = string("op_2236_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_170")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_2236_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_2247 = const()[name = string("op_2247"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2254 = const()[name = string("op_2254"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2248_cast_fp16 = transpose(perm = var_2247, x = attn_output_13_cast_fp16)[name = string("transpose_169")];
+            tensor<fp16, [1, 1, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_2254, x = var_2248_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_2259 = const()[name = string("op_2259"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2275_pad_type_0 = const()[name = string("op_2275_pad_type_0"), val = string("valid")];
+            int32 var_2275_groups_0 = const()[name = string("op_2275_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2275_strides_0 = const()[name = string("op_2275_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2275_pad_0 = const()[name = string("op_2275_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2275_dilations_0 = const()[name = string("op_2275_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550980416))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553601920))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2260_cast_fp16 = transpose(perm = var_2259, x = attn_output_15_cast_fp16)[name = string("transpose_168")];
+            tensor<fp16, [1, 2560, 1]> var_2275_cast_fp16 = conv(dilations = var_2275_dilations_0, groups = var_2275_groups_0, pad = var_2275_pad_0, pad_type = var_2275_pad_type_0, strides = var_2275_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_2260_cast_fp16)[name = string("op_2275_cast_fp16")];
+            tensor<int32, [3]> var_2279 = const()[name = string("op_2279"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2285 = const()[name = string("op_2285"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_51_cast_fp16 = transpose(perm = var_2279, x = var_2275_cast_fp16)[name = string("transpose_167")];
+            tensor<fp16, [1, 1, 2560]> var_2287_cast_fp16 = mul(x = x_51_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2287_cast_fp16")];
+            bool input_79_interleave_0 = const()[name = string("input_79_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_79_cast_fp16 = concat(axis = var_2285, interleave = input_79_interleave_0, values = (x_51_cast_fp16, var_2287_cast_fp16))[name = string("input_79_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2282_to_fp16 = const()[name = string("op_2282_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_2282_to_fp16, x = input_79_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_2292_split_sizes_0 = const()[name = string("op_2292_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2292_axis_0 = const()[name = string("op_2292_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2292_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2292_cast_fp16_1 = split(axis = var_2292_axis_0, split_sizes = var_2292_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_2292_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553604544)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_17_cast_fp16 = mul(x = var_2292_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_53_cast_fp16 = add(x = x_39_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_53_cast_fp16")];
+            int32 var_2301 = const()[name = string("op_2301"), val = int32(-1)];
+            fp16 const_33_promoted_to_fp16 = const()[name = string("const_33_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2303_cast_fp16 = mul(x = x_53_cast_fp16, y = const_33_promoted_to_fp16)[name = string("op_2303_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_81_cast_fp16 = concat(axis = var_2301, interleave = input_81_interleave_0, values = (x_53_cast_fp16, var_2303_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2298_to_fp16 = const()[name = string("op_2298_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_2298_to_fp16, x = input_81_cast_fp16)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_2308_split_sizes_0 = const()[name = string("op_2308_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2308_axis_0 = const()[name = string("op_2308_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2308_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2308_cast_fp16_1 = split(axis = var_2308_axis_0, split_sizes = var_2308_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_2308_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553609728)))];
+            tensor<fp16, [1, 1, 2560]> h_15_cast_fp16 = mul(x = var_2308_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_2319 = const()[name = string("op_2319"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_83_axes_0 = const()[name = string("input_83_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2320 = transpose(perm = var_2319, x = h_15_cast_fp16)[name = string("transpose_166")];
+            tensor<fp16, [1, 2560, 1, 1]> input_83 = expand_dims(axes = input_83_axes_0, x = var_2320)[name = string("input_83")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_83)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_83)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 1]> input_85 = mul(x = gate_11, y = up_5)[name = string("input_85")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_85)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_2360_axes_0 = const()[name = string("op_2360_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2360 = squeeze(axes = var_2360_axes_0, x = mlp_out_5)[name = string("op_2360")];
+            tensor<int32, [3]> var_2364 = const()[name = string("op_2364"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2370 = const()[name = string("op_2370"), val = int32(-1)];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_55 = transpose(perm = var_2364, x = var_2360)[name = string("transpose_165")];
+            tensor<fp16, [1, 1, 2560]> var_2372 = mul(x = x_55, y = const_34_promoted)[name = string("op_2372")];
+            bool input_87_interleave_0 = const()[name = string("input_87_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_87 = concat(axis = var_2370, interleave = input_87_interleave_0, values = (x_55, var_2372))[name = string("input_87")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2367_to_fp16 = const()[name = string("op_2367_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_2367_to_fp16, x = input_87)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_2377_split_sizes_0 = const()[name = string("op_2377_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2377_axis_0 = const()[name = string("op_2377_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2377_0, tensor<fp16, [1, 1, 2560]> var_2377_1 = split(axis = var_2377_axis_0, split_sizes = var_2377_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_2377")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_23 = mul(x = var_2377_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_25_cast_fp16 = add(x = x_53_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 512])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 1, 768])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_2405 = const()[name = string("op_2405"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_89_axes_0 = const()[name = string("input_89_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2406 = transpose(perm = var_2405, x = hidden_states_25_cast_fp16)[name = string("transpose_164")];
+            tensor<fp16, [1, 2560, 1, 1]> input_89 = expand_dims(axes = input_89_axes_0, x = var_2406)[name = string("input_89")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_89)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_2425 = const()[name = string("op_2425"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2426_cast_fp16 = transpose(perm = var_2425, x = per_layer_slice_5_cast_fp16)[name = string("transpose_163")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_2426_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_91_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_91_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553614912))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553942656))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_91_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_2442_axes_0 = const()[name = string("op_2442_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2442_cast_fp16 = squeeze(axes = var_2442_axes_0, x = gated_17_cast_fp16)[name = string("op_2442_cast_fp16")];
+            tensor<int32, [3]> var_2446 = const()[name = string("op_2446"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2452 = const()[name = string("op_2452"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_57_cast_fp16 = transpose(perm = var_2446, x = var_2442_cast_fp16)[name = string("transpose_162")];
+            tensor<fp16, [1, 1, 2560]> var_2454_cast_fp16 = mul(x = x_57_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2454_cast_fp16")];
+            bool input_93_interleave_0 = const()[name = string("input_93_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_93_cast_fp16 = concat(axis = var_2452, interleave = input_93_interleave_0, values = (x_57_cast_fp16, var_2454_cast_fp16))[name = string("input_93_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2449_to_fp16 = const()[name = string("op_2449_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_2449_to_fp16, x = input_93_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_2459_split_sizes_0 = const()[name = string("op_2459_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2459_axis_0 = const()[name = string("op_2459_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2459_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2459_cast_fp16_1 = split(axis = var_2459_axis_0, split_sizes = var_2459_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_2459_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553945280)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_29_cast_fp16 = mul(x = var_2459_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.aep-1])];
+            tensor<fp16, [1, 1, 2560]> x_59_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_36_promoted_to_fp16)[name = string("x_59_cast_fp16")];
+            tensor<int32, [1]> var_2471_axes_0 = const()[name = string("op_2471_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2471_cast_fp16 = squeeze(axes = var_2471_axes_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2471_cast_fp16")];
+            tensor<int32, [1]> var_2473_axes_0 = const()[name = string("op_2473_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2473_cast_fp16 = squeeze(axes = var_2473_axes_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2473_cast_fp16")];
+            tensor<int32, [4]> var_2476_begin_0 = const()[name = string("op_2476_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2476_end_0 = const()[name = string("op_2476_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2476_end_mask_0 = const()[name = string("op_2476_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2476_squeeze_mask_0 = const()[name = string("op_2476_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2476_cast_fp16 = slice_by_index(begin = var_2476_begin_0, end = var_2476_end_0, end_mask = var_2476_end_mask_0, squeeze_mask = var_2476_squeeze_mask_0, x = K_sliding_in)[name = string("op_2476_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_7_axes_0 = const()[name = string("K_sliding_slot_7_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_7_cast_fp16 = expand_dims(axes = K_sliding_slot_7_axes_0, x = var_2476_cast_fp16)[name = string("K_sliding_slot_7_cast_fp16")];
+            tensor<int32, [4]> var_2481_begin_0 = const()[name = string("op_2481_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2481_end_0 = const()[name = string("op_2481_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2481_end_mask_0 = const()[name = string("op_2481_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2481_squeeze_mask_0 = const()[name = string("op_2481_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2481_cast_fp16 = slice_by_index(begin = var_2481_begin_0, end = var_2481_end_0, end_mask = var_2481_end_mask_0, squeeze_mask = var_2481_squeeze_mask_0, x = V_sliding_in)[name = string("op_2481_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_7_axes_0 = const()[name = string("V_sliding_slot_7_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_7_cast_fp16 = expand_dims(axes = V_sliding_slot_7_axes_0, x = var_2481_cast_fp16)[name = string("V_sliding_slot_7_cast_fp16")];
+            int32 var_2488 = const()[name = string("op_2488"), val = int32(-1)];
+            fp16 const_37_promoted_to_fp16 = const()[name = string("const_37_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2490_cast_fp16 = mul(x = x_59_cast_fp16, y = const_37_promoted_to_fp16)[name = string("op_2490_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_95_cast_fp16 = concat(axis = var_2488, interleave = input_95_interleave_0, values = (x_59_cast_fp16, var_2490_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2485_to_fp16 = const()[name = string("op_2485_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_2485_to_fp16, x = input_95_cast_fp16)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_2495_split_sizes_0 = const()[name = string("op_2495_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2495_axis_0 = const()[name = string("op_2495_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2495_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2495_cast_fp16_1 = split(axis = var_2495_axis_0, split_sizes = var_2495_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_2495_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553950464)))];
+            tensor<fp16, [1, 1, 2560]> h_19_cast_fp16 = mul(x = var_2495_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_2501 = const()[name = string("op_2501"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2504_axes_0 = const()[name = string("op_2504_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2502_cast_fp16 = transpose(perm = var_2501, x = h_19_cast_fp16)[name = string("transpose_161")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2504_cast_fp16 = expand_dims(axes = var_2504_axes_0, x = var_2502_cast_fp16)[name = string("op_2504_cast_fp16")];
+            string var_2520_pad_type_0 = const()[name = string("op_2520_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2520_strides_0 = const()[name = string("op_2520_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2520_pad_0 = const()[name = string("op_2520_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2520_dilations_0 = const()[name = string("op_2520_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2520_groups_0 = const()[name = string("op_2520_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2520 = conv(dilations = var_2520_dilations_0, groups = var_2520_groups_0, pad = var_2520_pad_0, pad_type = var_2520_pad_type_0, strides = var_2520_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_2504_cast_fp16)[name = string("op_2520")];
+            tensor<int32, [4]> var_2525 = const()[name = string("op_2525"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2526 = reshape(shape = var_2525, x = var_2520)[name = string("op_2526")];
+            tensor<int32, [4]> var_2531 = const()[name = string("op_2531"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2541 = const()[name = string("op_2541"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2532 = transpose(perm = var_2531, x = var_2526)[name = string("transpose_160")];
+            tensor<fp16, [1, 8, 256]> x_61 = reshape(shape = var_2541, x = var_2532)[name = string("x_61")];
+            int32 var_2547 = const()[name = string("op_2547"), val = int32(-1)];
+            fp16 const_38_promoted = const()[name = string("const_38_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2549 = mul(x = x_61, y = const_38_promoted)[name = string("op_2549")];
+            bool input_99_interleave_0 = const()[name = string("input_99_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_99 = concat(axis = var_2547, interleave = input_99_interleave_0, values = (x_61, var_2549))[name = string("input_99")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2544_to_fp16 = const()[name = string("op_2544_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_2544_to_fp16, x = input_99)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_2554_split_sizes_0 = const()[name = string("op_2554_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2554_axis_0 = const()[name = string("op_2554_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2554_0, tensor<fp16, [1, 8, 256]> var_2554_1 = split(axis = var_2554_axis_0, split_sizes = var_2554_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_2554")];
+            tensor<fp16, [1, 8, 256]> var_2556 = mul(x = var_2554_0, y = layers_3_self_attn_q_norm_weight)[name = string("op_2556")];
+            tensor<int32, [4]> var_2561 = const()[name = string("op_2561"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_27 = reshape(shape = var_2561, x = var_2556)[name = string("q_27")];
+            tensor<fp16, [1, 8, 1, 256]> var_2563_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_2563_cast_fp16")];
+            tensor<int32, [2]> var_2564_split_sizes_0 = const()[name = string("op_2564_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2564_axis_0 = const()[name = string("op_2564_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2564_0, tensor<fp16, [1, 8, 1, 128]> var_2564_1 = split(axis = var_2564_axis_0, split_sizes = var_2564_split_sizes_0, x = q_27)[name = string("op_2564")];
+            fp16 const_39_promoted = const()[name = string("const_39_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2566 = mul(x = var_2564_1, y = const_39_promoted)[name = string("op_2566")];
+            int32 var_2568 = const()[name = string("op_2568"), val = int32(-1)];
+            bool var_2569_interleave_0 = const()[name = string("op_2569_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2569 = concat(axis = var_2568, interleave = var_2569_interleave_0, values = (var_2566, var_2564_0))[name = string("op_2569")];
+            tensor<fp16, [1, 8, 1, 256]> var_2570_cast_fp16 = mul(x = var_2569, y = sin_s)[name = string("op_2570_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_31_cast_fp16 = add(x = var_2563_cast_fp16, y = var_2570_cast_fp16)[name = string("q_31_cast_fp16")];
+            string var_2583_pad_type_0 = const()[name = string("op_2583_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2583_strides_0 = const()[name = string("op_2583_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2583_pad_0 = const()[name = string("op_2583_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2583_dilations_0 = const()[name = string("op_2583_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2583_groups_0 = const()[name = string("op_2583_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2583 = conv(dilations = var_2583_dilations_0, groups = var_2583_groups_0, pad = var_2583_pad_0, pad_type = var_2583_pad_type_0, strides = var_2583_strides_0, weight = layers_3_self_attn_k_proj_weight_palettized, x = var_2504_cast_fp16)[name = string("op_2583")];
+            tensor<int32, [4]> var_2588 = const()[name = string("op_2588"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2589 = reshape(shape = var_2588, x = var_2583)[name = string("op_2589")];
+            tensor<int32, [4]> var_2594 = const()[name = string("op_2594"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_2611_pad_type_0 = const()[name = string("op_2611_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2611_strides_0 = const()[name = string("op_2611_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2611_pad_0 = const()[name = string("op_2611_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2611_dilations_0 = const()[name = string("op_2611_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2611_groups_0 = const()[name = string("op_2611_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2611 = conv(dilations = var_2611_dilations_0, groups = var_2611_groups_0, pad = var_2611_pad_0, pad_type = var_2611_pad_type_0, strides = var_2611_strides_0, weight = layers_3_self_attn_v_proj_weight_palettized, x = var_2504_cast_fp16)[name = string("op_2611")];
+            tensor<int32, [4]> var_2616 = const()[name = string("op_2616"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2617 = reshape(shape = var_2616, x = var_2611)[name = string("op_2617")];
+            tensor<int32, [4]> var_2622 = const()[name = string("op_2622"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2632 = const()[name = string("op_2632"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_2595 = transpose(perm = var_2594, x = var_2589)[name = string("transpose_159")];
+            tensor<fp16, [1, 2, 256]> x_63 = reshape(shape = var_2632, x = var_2595)[name = string("x_63")];
+            int32 var_2638 = const()[name = string("op_2638"), val = int32(-1)];
+            fp16 const_40_promoted = const()[name = string("const_40_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_2640 = mul(x = x_63, y = const_40_promoted)[name = string("op_2640")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_101 = concat(axis = var_2638, interleave = input_101_interleave_0, values = (x_63, var_2640))[name = string("input_101")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2635_to_fp16 = const()[name = string("op_2635_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_2635_to_fp16, x = input_101)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_2645_split_sizes_0 = const()[name = string("op_2645_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2645_axis_0 = const()[name = string("op_2645_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_2645_0, tensor<fp16, [1, 2, 256]> var_2645_1 = split(axis = var_2645_axis_0, split_sizes = var_2645_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_2645")];
+            tensor<fp16, [1, 2, 256]> var_2647 = mul(x = var_2645_0, y = layers_3_self_attn_k_norm_weight)[name = string("op_2647")];
+            tensor<int32, [4]> var_2652 = const()[name = string("op_2652"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_29 = reshape(shape = var_2652, x = var_2647)[name = string("q_29")];
+            fp16 var_2654_promoted = const()[name = string("op_2654_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_2623 = transpose(perm = var_2622, x = var_2617)[name = string("transpose_158")];
+            tensor<fp16, [1, 2, 1, 256]> var_2655 = pow(x = var_2623, y = var_2654_promoted)[name = string("op_2655")];
+            tensor<int32, [1]> var_2660_axes_0 = const()[name = string("op_2660_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2660_keep_dims_0 = const()[name = string("op_2660_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_2660 = reduce_mean(axes = var_2660_axes_0, keep_dims = var_2660_keep_dims_0, x = var_2655)[name = string("op_2660")];
+            fp16 var_2662_to_fp16 = const()[name = string("op_2662_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_7_cast_fp16 = add(x = var_2660, y = var_2662_to_fp16)[name = string("mean_sq_7_cast_fp16")];
+            fp32 var_2664_epsilon_0 = const()[name = string("op_2664_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_2664_cast_fp16 = rsqrt(epsilon = var_2664_epsilon_0, x = mean_sq_7_cast_fp16)[name = string("op_2664_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_105_cast_fp16 = mul(x = var_2623, y = var_2664_cast_fp16)[name = string("input_105_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_2666_cast_fp16 = mul(x = q_29, y = cos_s)[name = string("op_2666_cast_fp16")];
+            tensor<int32, [2]> var_2667_split_sizes_0 = const()[name = string("op_2667_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2667_axis_0 = const()[name = string("op_2667_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_2667_0, tensor<fp16, [1, 2, 1, 128]> var_2667_1 = split(axis = var_2667_axis_0, split_sizes = var_2667_split_sizes_0, x = q_29)[name = string("op_2667")];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_2669 = mul(x = var_2667_1, y = const_41_promoted)[name = string("op_2669")];
+            int32 var_2671 = const()[name = string("op_2671"), val = int32(-1)];
+            bool var_2672_interleave_0 = const()[name = string("op_2672_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_2672 = concat(axis = var_2671, interleave = var_2672_interleave_0, values = (var_2669, var_2667_0))[name = string("op_2672")];
+            tensor<fp16, [1, 2, 1, 256]> var_2673_cast_fp16 = mul(x = var_2672, y = sin_s)[name = string("op_2673_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_103_cast_fp16 = add(x = var_2666_cast_fp16, y = var_2673_cast_fp16)[name = string("input_103_cast_fp16")];
+            tensor<int32, [8]> k_padded_7_pad_0 = const()[name = string("k_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_7_mode_0 = const()[name = string("k_padded_7_mode_0"), val = string("constant")];
+            fp16 const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_7_cast_fp16 = pad(constant_val = const_42_to_fp16, mode = k_padded_7_mode_0, pad = k_padded_7_pad_0, x = input_103_cast_fp16)[name = string("k_padded_7_cast_fp16")];
+            tensor<int32, [8]> v_padded_7_pad_0 = const()[name = string("v_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_7_mode_0 = const()[name = string("v_padded_7_mode_0"), val = string("constant")];
+            fp16 const_43_to_fp16 = const()[name = string("const_43_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_7_cast_fp16 = pad(constant_val = const_43_to_fp16, mode = v_padded_7_mode_0, pad = v_padded_7_pad_0, x = input_105_cast_fp16)[name = string("v_padded_7_cast_fp16")];
+            tensor<int32, [4]> var_2702_begin_0 = const()[name = string("op_2702_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2702_end_0 = const()[name = string("op_2702_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2702_end_mask_0 = const()[name = string("op_2702_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2702_cast_fp16 = slice_by_index(begin = var_2702_begin_0, end = var_2702_end_0, end_mask = var_2702_end_mask_0, x = K_sliding_slot_7_cast_fp16)[name = string("op_2702_cast_fp16")];
+            int32 var_2709 = const()[name = string("op_2709"), val = int32(2)];
+            bool K_sliding_out_7_interleave_0 = const()[name = string("K_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_7_cast_fp16 = concat(axis = var_2709, interleave = K_sliding_out_7_interleave_0, values = (var_2702_cast_fp16, k_padded_7_cast_fp16))[name = string("K_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_2725_begin_0 = const()[name = string("op_2725_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2725_end_0 = const()[name = string("op_2725_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2725_end_mask_0 = const()[name = string("op_2725_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2725_cast_fp16 = slice_by_index(begin = var_2725_begin_0, end = var_2725_end_0, end_mask = var_2725_end_mask_0, x = V_sliding_slot_7_cast_fp16)[name = string("op_2725_cast_fp16")];
+            int32 var_2732 = const()[name = string("op_2732"), val = int32(2)];
+            bool V_sliding_out_7_interleave_0 = const()[name = string("V_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_7_cast_fp16 = concat(axis = var_2732, interleave = V_sliding_out_7_interleave_0, values = (var_2725_cast_fp16, v_padded_7_cast_fp16))[name = string("V_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_7_begin_0 = const()[name = string("K_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_7_end_0 = const()[name = string("K_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_7_end_mask_0 = const()[name = string("K_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_7_cast_fp16 = slice_by_index(begin = K_for_attn_7_begin_0, end = K_for_attn_7_end_0, end_mask = K_for_attn_7_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("K_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_7_begin_0 = const()[name = string("V_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_7_end_0 = const()[name = string("V_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_7_end_mask_0 = const()[name = string("V_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_7_cast_fp16 = slice_by_index(begin = V_for_attn_7_begin_0, end = V_for_attn_7_end_0, end_mask = V_for_attn_7_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("V_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> transpose_12_perm_0 = const()[name = string("transpose_12_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_6_reps_0 = const()[name = string("tile_6_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_12_cast_fp16 = transpose(perm = transpose_12_perm_0, x = K_for_attn_7_cast_fp16)[name = string("transpose_157")];
+            tensor<fp16, [8, 1, 512, 256]> tile_6_cast_fp16 = tile(reps = tile_6_reps_0, x = transpose_12_cast_fp16)[name = string("tile_6_cast_fp16")];
+            tensor<int32, [5]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_12_cast_fp16 = reshape(shape = concat_12, x = tile_6_cast_fp16)[name = string("reshape_12_cast_fp16")];
+            tensor<int32, [5]> transpose_13_perm_0 = const()[name = string("transpose_13_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_13 = const()[name = string("concat_13"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_13_cast_fp16 = transpose(perm = transpose_13_perm_0, x = reshape_12_cast_fp16)[name = string("transpose_156")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_13_cast_fp16 = reshape(shape = concat_13, x = transpose_13_cast_fp16)[name = string("reshape_13_cast_fp16")];
+            tensor<int32, [4]> transpose_51_perm_0 = const()[name = string("transpose_51_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_14_perm_0 = const()[name = string("transpose_14_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_7_reps_0 = const()[name = string("tile_7_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_14_cast_fp16 = transpose(perm = transpose_14_perm_0, x = V_for_attn_7_cast_fp16)[name = string("transpose_155")];
+            tensor<fp16, [8, 1, 512, 256]> tile_7_cast_fp16 = tile(reps = tile_7_reps_0, x = transpose_14_cast_fp16)[name = string("tile_7_cast_fp16")];
+            tensor<int32, [5]> concat_14 = const()[name = string("concat_14"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_14_cast_fp16 = reshape(shape = concat_14, x = tile_7_cast_fp16)[name = string("reshape_14_cast_fp16")];
+            tensor<int32, [5]> transpose_15_perm_0 = const()[name = string("transpose_15_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_15 = const()[name = string("concat_15"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_15_cast_fp16 = transpose(perm = transpose_15_perm_0, x = reshape_14_cast_fp16)[name = string("transpose_154")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_15_cast_fp16 = reshape(shape = concat_15, x = transpose_15_cast_fp16)[name = string("reshape_15_cast_fp16")];
+            tensor<int32, [4]> V_expanded_7_perm_0 = const()[name = string("V_expanded_7_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_51_cast_fp16 = transpose(perm = transpose_51_perm_0, x = reshape_13_cast_fp16)[name = string("transpose_153")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_31_cast_fp16, y = transpose_51_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_67_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 1, 512]> var_2773 = sub(x = x_67_cast_fp16, y = reduce_max_3)[name = string("op_2773")];
+            tensor<fp16, [1, 8, 1, 512]> var_2779 = exp(x = var_2773)[name = string("op_2779")];
+            tensor<int32, [1]> var_2789_axes_0 = const()[name = string("op_2789_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2789_keep_dims_0 = const()[name = string("op_2789_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2789 = reduce_sum(axes = var_2789_axes_0, keep_dims = var_2789_keep_dims_0, x = var_2779)[name = string("op_2789")];
+            tensor<fp16, [1, 8, 1, 512]> var_2795_cast_fp16 = real_div(x = var_2779, y = var_2789)[name = string("op_2795_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_7_cast_fp16 = transpose(perm = V_expanded_7_perm_0, x = reshape_15_cast_fp16)[name = string("transpose_152")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_2795_cast_fp16, y = V_expanded_7_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_2806 = const()[name = string("op_2806"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2813 = const()[name = string("op_2813"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2807_cast_fp16 = transpose(perm = var_2806, x = attn_output_19_cast_fp16)[name = string("transpose_151")];
+            tensor<fp16, [1, 1, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_2813, x = var_2807_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_2818 = const()[name = string("op_2818"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2834_pad_type_0 = const()[name = string("op_2834_pad_type_0"), val = string("valid")];
+            int32 var_2834_groups_0 = const()[name = string("op_2834_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2834_strides_0 = const()[name = string("op_2834_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2834_pad_0 = const()[name = string("op_2834_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2834_dilations_0 = const()[name = string("op_2834_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553955648))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556577152))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2819_cast_fp16 = transpose(perm = var_2818, x = attn_output_21_cast_fp16)[name = string("transpose_150")];
+            tensor<fp16, [1, 2560, 1]> var_2834_cast_fp16 = conv(dilations = var_2834_dilations_0, groups = var_2834_groups_0, pad = var_2834_pad_0, pad_type = var_2834_pad_type_0, strides = var_2834_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_2819_cast_fp16)[name = string("op_2834_cast_fp16")];
+            tensor<int32, [3]> var_2838 = const()[name = string("op_2838"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2844 = const()[name = string("op_2844"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_71_cast_fp16 = transpose(perm = var_2838, x = var_2834_cast_fp16)[name = string("transpose_149")];
+            tensor<fp16, [1, 1, 2560]> var_2846_cast_fp16 = mul(x = x_71_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2846_cast_fp16")];
+            bool input_109_interleave_0 = const()[name = string("input_109_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_109_cast_fp16 = concat(axis = var_2844, interleave = input_109_interleave_0, values = (x_71_cast_fp16, var_2846_cast_fp16))[name = string("input_109_cast_fp16")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2841_to_fp16 = const()[name = string("op_2841_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2841_to_fp16, x = input_109_cast_fp16)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2851_split_sizes_0 = const()[name = string("op_2851_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2851_axis_0 = const()[name = string("op_2851_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2851_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2851_cast_fp16_1 = split(axis = var_2851_axis_0, split_sizes = var_2851_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2851_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556579776)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_23_cast_fp16 = mul(x = var_2851_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_73_cast_fp16 = add(x = x_59_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2860 = const()[name = string("op_2860"), val = int32(-1)];
+            fp16 const_45_promoted_to_fp16 = const()[name = string("const_45_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2862_cast_fp16 = mul(x = x_73_cast_fp16, y = const_45_promoted_to_fp16)[name = string("op_2862_cast_fp16")];
+            bool input_111_interleave_0 = const()[name = string("input_111_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_111_cast_fp16 = concat(axis = var_2860, interleave = input_111_interleave_0, values = (x_73_cast_fp16, var_2862_cast_fp16))[name = string("input_111_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2857_to_fp16 = const()[name = string("op_2857_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2857_to_fp16, x = input_111_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2867_split_sizes_0 = const()[name = string("op_2867_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2867_axis_0 = const()[name = string("op_2867_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2867_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2867_cast_fp16_1 = split(axis = var_2867_axis_0, split_sizes = var_2867_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2867_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556584960)))];
+            tensor<fp16, [1, 1, 2560]> h_21_cast_fp16 = mul(x = var_2867_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_2878 = const()[name = string("op_2878"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_113_axes_0 = const()[name = string("input_113_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2879 = transpose(perm = var_2878, x = h_21_cast_fp16)[name = string("transpose_148")];
+            tensor<fp16, [1, 2560, 1, 1]> input_113 = expand_dims(axes = input_113_axes_0, x = var_2879)[name = string("input_113")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_113)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_113)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 1]> input_115 = mul(x = gate_15, y = up_7)[name = string("input_115")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_115)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_2919_axes_0 = const()[name = string("op_2919_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2919 = squeeze(axes = var_2919_axes_0, x = mlp_out_7)[name = string("op_2919")];
+            tensor<int32, [3]> var_2923 = const()[name = string("op_2923"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2929 = const()[name = string("op_2929"), val = int32(-1)];
+            fp16 const_46_promoted = const()[name = string("const_46_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_75 = transpose(perm = var_2923, x = var_2919)[name = string("transpose_147")];
+            tensor<fp16, [1, 1, 2560]> var_2931 = mul(x = x_75, y = const_46_promoted)[name = string("op_2931")];
+            bool input_117_interleave_0 = const()[name = string("input_117_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_117 = concat(axis = var_2929, interleave = input_117_interleave_0, values = (x_75, var_2931))[name = string("input_117")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2926_to_fp16 = const()[name = string("op_2926_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2926_to_fp16, x = input_117)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2936_split_sizes_0 = const()[name = string("op_2936_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2936_axis_0 = const()[name = string("op_2936_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2936_0, tensor<fp16, [1, 1, 2560]> var_2936_1 = split(axis = var_2936_axis_0, split_sizes = var_2936_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2936")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_33 = mul(x = var_2936_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_35_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 768])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 1, 1024])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_2964 = const()[name = string("op_2964"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_119_axes_0 = const()[name = string("input_119_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2965 = transpose(perm = var_2964, x = hidden_states_35_cast_fp16)[name = string("transpose_146")];
+            tensor<fp16, [1, 2560, 1, 1]> input_119 = expand_dims(axes = input_119_axes_0, x = var_2965)[name = string("input_119")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_119)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_2984 = const()[name = string("op_2984"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2985_cast_fp16 = transpose(perm = var_2984, x = per_layer_slice_7_cast_fp16)[name = string("transpose_145")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_2985_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_121_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_121_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556590144))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556917888))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_121_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_3001_axes_0 = const()[name = string("op_3001_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3001_cast_fp16 = squeeze(axes = var_3001_axes_0, x = gated_23_cast_fp16)[name = string("op_3001_cast_fp16")];
+            tensor<int32, [3]> var_3005 = const()[name = string("op_3005"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3011 = const()[name = string("op_3011"), val = int32(-1)];
+            fp16 const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_77_cast_fp16 = transpose(perm = var_3005, x = var_3001_cast_fp16)[name = string("transpose_144")];
+            tensor<fp16, [1, 1, 2560]> var_3013_cast_fp16 = mul(x = x_77_cast_fp16, y = const_47_promoted_to_fp16)[name = string("op_3013_cast_fp16")];
+            bool input_123_interleave_0 = const()[name = string("input_123_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_123_cast_fp16 = concat(axis = var_3011, interleave = input_123_interleave_0, values = (x_77_cast_fp16, var_3013_cast_fp16))[name = string("input_123_cast_fp16")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3008_to_fp16 = const()[name = string("op_3008_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_3008_to_fp16, x = input_123_cast_fp16)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_3018_split_sizes_0 = const()[name = string("op_3018_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3018_axis_0 = const()[name = string("op_3018_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3018_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3018_cast_fp16_1 = split(axis = var_3018_axis_0, split_sizes = var_3018_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_3018_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556920512)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_39_cast_fp16 = mul(x = var_3018_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 1, 2560]> x_79_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_48_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            tensor<int32, [1]> var_3030_axes_0 = const()[name = string("op_3030_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3030_cast_fp16 = squeeze(axes = var_3030_axes_0, x = K_sliding_out_7_cast_fp16)[name = string("op_3030_cast_fp16")];
+            tensor<int32, [1]> var_3032_axes_0 = const()[name = string("op_3032_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3032_cast_fp16 = squeeze(axes = var_3032_axes_0, x = V_sliding_out_7_cast_fp16)[name = string("op_3032_cast_fp16")];
+            tensor<int32, [4]> var_3035_begin_0 = const()[name = string("op_3035_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3035_end_0 = const()[name = string("op_3035_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3035_end_mask_0 = const()[name = string("op_3035_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3035_squeeze_mask_0 = const()[name = string("op_3035_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_3035_cast_fp16 = slice_by_index(begin = var_3035_begin_0, end = var_3035_end_0, end_mask = var_3035_end_mask_0, squeeze_mask = var_3035_squeeze_mask_0, x = K_sliding_in)[name = string("op_3035_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_9_axes_0 = const()[name = string("K_sliding_slot_9_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_9_cast_fp16 = expand_dims(axes = K_sliding_slot_9_axes_0, x = var_3035_cast_fp16)[name = string("K_sliding_slot_9_cast_fp16")];
+            tensor<int32, [4]> var_3040_begin_0 = const()[name = string("op_3040_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3040_end_0 = const()[name = string("op_3040_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3040_end_mask_0 = const()[name = string("op_3040_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3040_squeeze_mask_0 = const()[name = string("op_3040_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_3040_cast_fp16 = slice_by_index(begin = var_3040_begin_0, end = var_3040_end_0, end_mask = var_3040_end_mask_0, squeeze_mask = var_3040_squeeze_mask_0, x = V_sliding_in)[name = string("op_3040_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_9_axes_0 = const()[name = string("V_sliding_slot_9_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_9_cast_fp16 = expand_dims(axes = V_sliding_slot_9_axes_0, x = var_3040_cast_fp16)[name = string("V_sliding_slot_9_cast_fp16")];
+            int32 var_3047 = const()[name = string("op_3047"), val = int32(-1)];
+            fp16 const_49_promoted_to_fp16 = const()[name = string("const_49_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3049_cast_fp16 = mul(x = x_79_cast_fp16, y = const_49_promoted_to_fp16)[name = string("op_3049_cast_fp16")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_125_cast_fp16 = concat(axis = var_3047, interleave = input_125_interleave_0, values = (x_79_cast_fp16, var_3049_cast_fp16))[name = string("input_125_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3044_to_fp16 = const()[name = string("op_3044_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_3044_to_fp16, x = input_125_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_3054_split_sizes_0 = const()[name = string("op_3054_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3054_axis_0 = const()[name = string("op_3054_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3054_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3054_cast_fp16_1 = split(axis = var_3054_axis_0, split_sizes = var_3054_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_3054_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556925696)))];
+            tensor<fp16, [1, 1, 2560]> h_25_cast_fp16 = mul(x = var_3054_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_3060 = const()[name = string("op_3060"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3063_axes_0 = const()[name = string("op_3063_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3061_cast_fp16 = transpose(perm = var_3060, x = h_25_cast_fp16)[name = string("transpose_143")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3063_cast_fp16 = expand_dims(axes = var_3063_axes_0, x = var_3061_cast_fp16)[name = string("op_3063_cast_fp16")];
+            string var_3079_pad_type_0 = const()[name = string("op_3079_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3079_strides_0 = const()[name = string("op_3079_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3079_pad_0 = const()[name = string("op_3079_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3079_dilations_0 = const()[name = string("op_3079_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3079_groups_0 = const()[name = string("op_3079_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3079 = conv(dilations = var_3079_dilations_0, groups = var_3079_groups_0, pad = var_3079_pad_0, pad_type = var_3079_pad_type_0, strides = var_3079_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_3063_cast_fp16)[name = string("op_3079")];
+            tensor<int32, [4]> var_3084 = const()[name = string("op_3084"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3085 = reshape(shape = var_3084, x = var_3079)[name = string("op_3085")];
+            tensor<int32, [4]> var_3090 = const()[name = string("op_3090"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3100 = const()[name = string("op_3100"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3091 = transpose(perm = var_3090, x = var_3085)[name = string("transpose_142")];
+            tensor<fp16, [1, 8, 256]> x_81 = reshape(shape = var_3100, x = var_3091)[name = string("x_81")];
+            int32 var_3106 = const()[name = string("op_3106"), val = int32(-1)];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3108 = mul(x = x_81, y = const_50_promoted)[name = string("op_3108")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_129 = concat(axis = var_3106, interleave = input_129_interleave_0, values = (x_81, var_3108))[name = string("input_129")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3103_to_fp16 = const()[name = string("op_3103_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_3103_to_fp16, x = input_129)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_3113_split_sizes_0 = const()[name = string("op_3113_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3113_axis_0 = const()[name = string("op_3113_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3113_0, tensor<fp16, [1, 8, 256]> var_3113_1 = split(axis = var_3113_axis_0, split_sizes = var_3113_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_3113")];
+            tensor<fp16, [1, 8, 256]> var_3115 = mul(x = var_3113_0, y = layers_4_self_attn_q_norm_weight)[name = string("op_3115")];
+            tensor<int32, [4]> var_3120 = const()[name = string("op_3120"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_35 = reshape(shape = var_3120, x = var_3115)[name = string("q_35")];
+            tensor<fp16, [1, 8, 1, 256]> var_3122_cast_fp16 = mul(x = q_35, y = cos_s)[name = string("op_3122_cast_fp16")];
+            tensor<int32, [2]> var_3123_split_sizes_0 = const()[name = string("op_3123_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3123_axis_0 = const()[name = string("op_3123_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3123_0, tensor<fp16, [1, 8, 1, 128]> var_3123_1 = split(axis = var_3123_axis_0, split_sizes = var_3123_split_sizes_0, x = q_35)[name = string("op_3123")];
+            fp16 const_51_promoted = const()[name = string("const_51_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3125 = mul(x = var_3123_1, y = const_51_promoted)[name = string("op_3125")];
+            int32 var_3127 = const()[name = string("op_3127"), val = int32(-1)];
+            bool var_3128_interleave_0 = const()[name = string("op_3128_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3128 = concat(axis = var_3127, interleave = var_3128_interleave_0, values = (var_3125, var_3123_0))[name = string("op_3128")];
+            tensor<fp16, [1, 8, 1, 256]> var_3129_cast_fp16 = mul(x = var_3128, y = sin_s)[name = string("op_3129_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_39_cast_fp16 = add(x = var_3122_cast_fp16, y = var_3129_cast_fp16)[name = string("q_39_cast_fp16")];
+            string var_3142_pad_type_0 = const()[name = string("op_3142_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3142_strides_0 = const()[name = string("op_3142_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3142_pad_0 = const()[name = string("op_3142_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3142_dilations_0 = const()[name = string("op_3142_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3142_groups_0 = const()[name = string("op_3142_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_3142 = conv(dilations = var_3142_dilations_0, groups = var_3142_groups_0, pad = var_3142_pad_0, pad_type = var_3142_pad_type_0, strides = var_3142_strides_0, weight = layers_4_self_attn_k_proj_weight_palettized, x = var_3063_cast_fp16)[name = string("op_3142")];
+            tensor<int32, [4]> var_3147 = const()[name = string("op_3147"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_3148 = reshape(shape = var_3147, x = var_3142)[name = string("op_3148")];
+            tensor<int32, [4]> var_3153 = const()[name = string("op_3153"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_3170_pad_type_0 = const()[name = string("op_3170_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3170_strides_0 = const()[name = string("op_3170_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3170_pad_0 = const()[name = string("op_3170_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3170_dilations_0 = const()[name = string("op_3170_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3170_groups_0 = const()[name = string("op_3170_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_3170 = conv(dilations = var_3170_dilations_0, groups = var_3170_groups_0, pad = var_3170_pad_0, pad_type = var_3170_pad_type_0, strides = var_3170_strides_0, weight = layers_4_self_attn_v_proj_weight_palettized, x = var_3063_cast_fp16)[name = string("op_3170")];
+            tensor<int32, [4]> var_3175 = const()[name = string("op_3175"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_3176 = reshape(shape = var_3175, x = var_3170)[name = string("op_3176")];
+            tensor<int32, [4]> var_3181 = const()[name = string("op_3181"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3191 = const()[name = string("op_3191"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_3154 = transpose(perm = var_3153, x = var_3148)[name = string("transpose_141")];
+            tensor<fp16, [1, 2, 256]> x_83 = reshape(shape = var_3191, x = var_3154)[name = string("x_83")];
+            int32 var_3197 = const()[name = string("op_3197"), val = int32(-1)];
+            fp16 const_52_promoted = const()[name = string("const_52_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_3199 = mul(x = x_83, y = const_52_promoted)[name = string("op_3199")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_131 = concat(axis = var_3197, interleave = input_131_interleave_0, values = (x_83, var_3199))[name = string("input_131")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3194_to_fp16 = const()[name = string("op_3194_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_3194_to_fp16, x = input_131)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_3204_split_sizes_0 = const()[name = string("op_3204_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3204_axis_0 = const()[name = string("op_3204_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_3204_0, tensor<fp16, [1, 2, 256]> var_3204_1 = split(axis = var_3204_axis_0, split_sizes = var_3204_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_3204")];
+            tensor<fp16, [1, 2, 256]> var_3206 = mul(x = var_3204_0, y = layers_4_self_attn_k_norm_weight)[name = string("op_3206")];
+            tensor<int32, [4]> var_3211 = const()[name = string("op_3211"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_37 = reshape(shape = var_3211, x = var_3206)[name = string("q_37")];
+            fp16 var_3213_promoted = const()[name = string("op_3213_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_3182 = transpose(perm = var_3181, x = var_3176)[name = string("transpose_140")];
+            tensor<fp16, [1, 2, 1, 256]> var_3214 = pow(x = var_3182, y = var_3213_promoted)[name = string("op_3214")];
+            tensor<int32, [1]> var_3219_axes_0 = const()[name = string("op_3219_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3219_keep_dims_0 = const()[name = string("op_3219_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_3219 = reduce_mean(axes = var_3219_axes_0, keep_dims = var_3219_keep_dims_0, x = var_3214)[name = string("op_3219")];
+            fp16 var_3221_to_fp16 = const()[name = string("op_3221_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_9_cast_fp16 = add(x = var_3219, y = var_3221_to_fp16)[name = string("mean_sq_9_cast_fp16")];
+            fp32 var_3223_epsilon_0 = const()[name = string("op_3223_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_3223_cast_fp16 = rsqrt(epsilon = var_3223_epsilon_0, x = mean_sq_9_cast_fp16)[name = string("op_3223_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_135_cast_fp16 = mul(x = var_3182, y = var_3223_cast_fp16)[name = string("input_135_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_3225_cast_fp16 = mul(x = q_37, y = cos_s)[name = string("op_3225_cast_fp16")];
+            tensor<int32, [2]> var_3226_split_sizes_0 = const()[name = string("op_3226_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3226_axis_0 = const()[name = string("op_3226_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_3226_0, tensor<fp16, [1, 2, 1, 128]> var_3226_1 = split(axis = var_3226_axis_0, split_sizes = var_3226_split_sizes_0, x = q_37)[name = string("op_3226")];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_3228 = mul(x = var_3226_1, y = const_53_promoted)[name = string("op_3228")];
+            int32 var_3230 = const()[name = string("op_3230"), val = int32(-1)];
+            bool var_3231_interleave_0 = const()[name = string("op_3231_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_3231 = concat(axis = var_3230, interleave = var_3231_interleave_0, values = (var_3228, var_3226_0))[name = string("op_3231")];
+            tensor<fp16, [1, 2, 1, 256]> var_3232_cast_fp16 = mul(x = var_3231, y = sin_s)[name = string("op_3232_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_133_cast_fp16 = add(x = var_3225_cast_fp16, y = var_3232_cast_fp16)[name = string("input_133_cast_fp16")];
+            tensor<int32, [8]> k_padded_9_pad_0 = const()[name = string("k_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_9_mode_0 = const()[name = string("k_padded_9_mode_0"), val = string("constant")];
+            fp16 const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_9_cast_fp16 = pad(constant_val = const_54_to_fp16, mode = k_padded_9_mode_0, pad = k_padded_9_pad_0, x = input_133_cast_fp16)[name = string("k_padded_9_cast_fp16")];
+            tensor<int32, [8]> v_padded_9_pad_0 = const()[name = string("v_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_9_mode_0 = const()[name = string("v_padded_9_mode_0"), val = string("constant")];
+            fp16 const_55_to_fp16 = const()[name = string("const_55_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_9_cast_fp16 = pad(constant_val = const_55_to_fp16, mode = v_padded_9_mode_0, pad = v_padded_9_pad_0, x = input_135_cast_fp16)[name = string("v_padded_9_cast_fp16")];
+            tensor<int32, [4]> var_3261_begin_0 = const()[name = string("op_3261_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3261_end_0 = const()[name = string("op_3261_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3261_end_mask_0 = const()[name = string("op_3261_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3261_cast_fp16 = slice_by_index(begin = var_3261_begin_0, end = var_3261_end_0, end_mask = var_3261_end_mask_0, x = K_sliding_slot_9_cast_fp16)[name = string("op_3261_cast_fp16")];
+            int32 var_3268 = const()[name = string("op_3268"), val = int32(2)];
+            bool K_sliding_out_9_interleave_0 = const()[name = string("K_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_9_cast_fp16 = concat(axis = var_3268, interleave = K_sliding_out_9_interleave_0, values = (var_3261_cast_fp16, k_padded_9_cast_fp16))[name = string("K_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3284_begin_0 = const()[name = string("op_3284_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3284_end_0 = const()[name = string("op_3284_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3284_end_mask_0 = const()[name = string("op_3284_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3284_cast_fp16 = slice_by_index(begin = var_3284_begin_0, end = var_3284_end_0, end_mask = var_3284_end_mask_0, x = V_sliding_slot_9_cast_fp16)[name = string("op_3284_cast_fp16")];
+            int32 var_3291 = const()[name = string("op_3291"), val = int32(2)];
+            bool V_sliding_out_9_interleave_0 = const()[name = string("V_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_9_cast_fp16 = concat(axis = var_3291, interleave = V_sliding_out_9_interleave_0, values = (var_3284_cast_fp16, v_padded_9_cast_fp16))[name = string("V_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_9_begin_0 = const()[name = string("K_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_9_end_0 = const()[name = string("K_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_9_end_mask_0 = const()[name = string("K_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_9_cast_fp16 = slice_by_index(begin = K_for_attn_9_begin_0, end = K_for_attn_9_end_0, end_mask = K_for_attn_9_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("K_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_9_begin_0 = const()[name = string("V_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_9_end_0 = const()[name = string("V_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_9_end_mask_0 = const()[name = string("V_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_9_cast_fp16 = slice_by_index(begin = V_for_attn_9_begin_0, end = V_for_attn_9_end_0, end_mask = V_for_attn_9_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("V_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> transpose_16_perm_0 = const()[name = string("transpose_16_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_8_reps_0 = const()[name = string("tile_8_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_16_cast_fp16 = transpose(perm = transpose_16_perm_0, x = K_for_attn_9_cast_fp16)[name = string("transpose_139")];
+            tensor<fp16, [8, 1, 512, 256]> tile_8_cast_fp16 = tile(reps = tile_8_reps_0, x = transpose_16_cast_fp16)[name = string("tile_8_cast_fp16")];
+            tensor<int32, [5]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_16_cast_fp16 = reshape(shape = concat_16, x = tile_8_cast_fp16)[name = string("reshape_16_cast_fp16")];
+            tensor<int32, [5]> transpose_17_perm_0 = const()[name = string("transpose_17_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_17 = const()[name = string("concat_17"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_17_cast_fp16 = transpose(perm = transpose_17_perm_0, x = reshape_16_cast_fp16)[name = string("transpose_138")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_17_cast_fp16 = reshape(shape = concat_17, x = transpose_17_cast_fp16)[name = string("reshape_17_cast_fp16")];
+            tensor<int32, [4]> transpose_52_perm_0 = const()[name = string("transpose_52_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_18_perm_0 = const()[name = string("transpose_18_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_9_reps_0 = const()[name = string("tile_9_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_18_cast_fp16 = transpose(perm = transpose_18_perm_0, x = V_for_attn_9_cast_fp16)[name = string("transpose_137")];
+            tensor<fp16, [8, 1, 512, 256]> tile_9_cast_fp16 = tile(reps = tile_9_reps_0, x = transpose_18_cast_fp16)[name = string("tile_9_cast_fp16")];
+            tensor<int32, [5]> concat_18 = const()[name = string("concat_18"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_18_cast_fp16 = reshape(shape = concat_18, x = tile_9_cast_fp16)[name = string("reshape_18_cast_fp16")];
+            tensor<int32, [5]> transpose_19_perm_0 = const()[name = string("transpose_19_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_19 = const()[name = string("concat_19"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_19_cast_fp16 = transpose(perm = transpose_19_perm_0, x = reshape_18_cast_fp16)[name = string("transpose_136")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_19_cast_fp16 = reshape(shape = concat_19, x = transpose_19_cast_fp16)[name = string("reshape_19_cast_fp16")];
+            tensor<int32, [4]> V_expanded_9_perm_0 = const()[name = string("V_expanded_9_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_52_cast_fp16 = transpose(perm = transpose_52_perm_0, x = reshape_17_cast_fp16)[name = string("transpose_135")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_39_cast_fp16, y = transpose_52_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_87_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_87_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_87_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 1, 512]> var_3332 = sub(x = x_87_cast_fp16, y = reduce_max_4)[name = string("op_3332")];
+            tensor<fp16, [1, 8, 1, 512]> var_3338 = exp(x = var_3332)[name = string("op_3338")];
+            tensor<int32, [1]> var_3348_axes_0 = const()[name = string("op_3348_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3348_keep_dims_0 = const()[name = string("op_3348_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3348 = reduce_sum(axes = var_3348_axes_0, keep_dims = var_3348_keep_dims_0, x = var_3338)[name = string("op_3348")];
+            tensor<fp16, [1, 8, 1, 512]> var_3354_cast_fp16 = real_div(x = var_3338, y = var_3348)[name = string("op_3354_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_9_cast_fp16 = transpose(perm = V_expanded_9_perm_0, x = reshape_19_cast_fp16)[name = string("transpose_134")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_3354_cast_fp16, y = V_expanded_9_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_3365 = const()[name = string("op_3365"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3372 = const()[name = string("op_3372"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3366_cast_fp16 = transpose(perm = var_3365, x = attn_output_25_cast_fp16)[name = string("transpose_133")];
+            tensor<fp16, [1, 1, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_3372, x = var_3366_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_3377 = const()[name = string("op_3377"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3393_pad_type_0 = const()[name = string("op_3393_pad_type_0"), val = string("valid")];
+            int32 var_3393_groups_0 = const()[name = string("op_3393_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3393_strides_0 = const()[name = string("op_3393_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3393_pad_0 = const()[name = string("op_3393_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3393_dilations_0 = const()[name = string("op_3393_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556930880))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559552384))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3378_cast_fp16 = transpose(perm = var_3377, x = attn_output_27_cast_fp16)[name = string("transpose_132")];
+            tensor<fp16, [1, 2560, 1]> var_3393_cast_fp16 = conv(dilations = var_3393_dilations_0, groups = var_3393_groups_0, pad = var_3393_pad_0, pad_type = var_3393_pad_type_0, strides = var_3393_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_3378_cast_fp16)[name = string("op_3393_cast_fp16")];
+            tensor<int32, [3]> var_3397 = const()[name = string("op_3397"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3403 = const()[name = string("op_3403"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_91_cast_fp16 = transpose(perm = var_3397, x = var_3393_cast_fp16)[name = string("transpose_131")];
+            tensor<fp16, [1, 1, 2560]> var_3405_cast_fp16 = mul(x = x_91_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3405_cast_fp16")];
+            bool input_139_interleave_0 = const()[name = string("input_139_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_139_cast_fp16 = concat(axis = var_3403, interleave = input_139_interleave_0, values = (x_91_cast_fp16, var_3405_cast_fp16))[name = string("input_139_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3400_to_fp16 = const()[name = string("op_3400_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_3400_to_fp16, x = input_139_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_3410_split_sizes_0 = const()[name = string("op_3410_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3410_axis_0 = const()[name = string("op_3410_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3410_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3410_cast_fp16_1 = split(axis = var_3410_axis_0, split_sizes = var_3410_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_3410_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559555008)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_29_cast_fp16 = mul(x = var_3410_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_93_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_93_cast_fp16")];
+            int32 var_3419 = const()[name = string("op_3419"), val = int32(-1)];
+            fp16 const_57_promoted_to_fp16 = const()[name = string("const_57_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3421_cast_fp16 = mul(x = x_93_cast_fp16, y = const_57_promoted_to_fp16)[name = string("op_3421_cast_fp16")];
+            bool input_141_interleave_0 = const()[name = string("input_141_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_141_cast_fp16 = concat(axis = var_3419, interleave = input_141_interleave_0, values = (x_93_cast_fp16, var_3421_cast_fp16))[name = string("input_141_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3416_to_fp16 = const()[name = string("op_3416_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_3416_to_fp16, x = input_141_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_3426_split_sizes_0 = const()[name = string("op_3426_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3426_axis_0 = const()[name = string("op_3426_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3426_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3426_cast_fp16_1 = split(axis = var_3426_axis_0, split_sizes = var_3426_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_3426_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559560192)))];
+            tensor<fp16, [1, 1, 2560]> h_27_cast_fp16 = mul(x = var_3426_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_3437 = const()[name = string("op_3437"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_143_axes_0 = const()[name = string("input_143_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3438 = transpose(perm = var_3437, x = h_27_cast_fp16)[name = string("transpose_130")];
+            tensor<fp16, [1, 2560, 1, 1]> input_143 = expand_dims(axes = input_143_axes_0, x = var_3438)[name = string("input_143")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_143)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_143)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 1]> input_145 = mul(x = gate_19, y = up_9)[name = string("input_145")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_145)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_3478_axes_0 = const()[name = string("op_3478_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3478 = squeeze(axes = var_3478_axes_0, x = mlp_out_9)[name = string("op_3478")];
+            tensor<int32, [3]> var_3482 = const()[name = string("op_3482"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3488 = const()[name = string("op_3488"), val = int32(-1)];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_95 = transpose(perm = var_3482, x = var_3478)[name = string("transpose_129")];
+            tensor<fp16, [1, 1, 2560]> var_3490 = mul(x = x_95, y = const_58_promoted)[name = string("op_3490")];
+            bool input_147_interleave_0 = const()[name = string("input_147_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_147 = concat(axis = var_3488, interleave = input_147_interleave_0, values = (x_95, var_3490))[name = string("input_147")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3485_to_fp16 = const()[name = string("op_3485_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_3485_to_fp16, x = input_147)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_3495_split_sizes_0 = const()[name = string("op_3495_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3495_axis_0 = const()[name = string("op_3495_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3495_0, tensor<fp16, [1, 1, 2560]> var_3495_1 = split(axis = var_3495_axis_0, split_sizes = var_3495_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_3495")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_43 = mul(x = var_3495_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_45_cast_fp16 = add(x = x_93_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 1024])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 1, 1280])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_3523 = const()[name = string("op_3523"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_149_axes_0 = const()[name = string("input_149_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3524 = transpose(perm = var_3523, x = hidden_states_45_cast_fp16)[name = string("transpose_128")];
+            tensor<fp16, [1, 2560, 1, 1]> input_149 = expand_dims(axes = input_149_axes_0, x = var_3524)[name = string("input_149")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_149)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_3543 = const()[name = string("op_3543"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3544_cast_fp16 = transpose(perm = var_3543, x = per_layer_slice_9_cast_fp16)[name = string("transpose_127")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_3544_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_151_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_151_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559565376))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559893120))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_151_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_3560_axes_0 = const()[name = string("op_3560_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3560_cast_fp16 = squeeze(axes = var_3560_axes_0, x = gated_29_cast_fp16)[name = string("op_3560_cast_fp16")];
+            tensor<int32, [3]> var_3564 = const()[name = string("op_3564"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3570 = const()[name = string("op_3570"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_97_cast_fp16 = transpose(perm = var_3564, x = var_3560_cast_fp16)[name = string("transpose_126")];
+            tensor<fp16, [1, 1, 2560]> var_3572_cast_fp16 = mul(x = x_97_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3572_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_153_cast_fp16 = concat(axis = var_3570, interleave = input_153_interleave_0, values = (x_97_cast_fp16, var_3572_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3567_to_fp16 = const()[name = string("op_3567_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_3567_to_fp16, x = input_153_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_3577_split_sizes_0 = const()[name = string("op_3577_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3577_axis_0 = const()[name = string("op_3577_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3577_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3577_cast_fp16_1 = split(axis = var_3577_axis_0, split_sizes = var_3577_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_3577_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559895744)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_49_cast_fp16 = mul(x = var_3577_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.2cp-1])];
+            tensor<fp16, [1, 1, 2560]> x_99_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_60_promoted_to_fp16)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> var_3589_axes_0 = const()[name = string("op_3589_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3589_cast_fp16 = squeeze(axes = var_3589_axes_0, x = K_sliding_out_9_cast_fp16)[name = string("op_3589_cast_fp16")];
+            tensor<int32, [1]> var_3591_axes_0 = const()[name = string("op_3591_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3591_cast_fp16 = squeeze(axes = var_3591_axes_0, x = V_sliding_out_9_cast_fp16)[name = string("op_3591_cast_fp16")];
+            tensor<int32, [4]> var_3594_begin_0 = const()[name = string("op_3594_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3594_end_0 = const()[name = string("op_3594_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_3594_end_mask_0 = const()[name = string("op_3594_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3594_squeeze_mask_0 = const()[name = string("op_3594_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_3594_cast_fp16 = slice_by_index(begin = var_3594_begin_0, end = var_3594_end_0, end_mask = var_3594_end_mask_0, squeeze_mask = var_3594_squeeze_mask_0, x = K_full_in)[name = string("op_3594_cast_fp16")];
+            tensor<int32, [1]> K_full_slot_1_axes_0 = const()[name = string("K_full_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_slot_1_cast_fp16 = expand_dims(axes = K_full_slot_1_axes_0, x = var_3594_cast_fp16)[name = string("K_full_slot_1_cast_fp16")];
+            tensor<int32, [4]> var_3599_begin_0 = const()[name = string("op_3599_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3599_end_0 = const()[name = string("op_3599_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_3599_end_mask_0 = const()[name = string("op_3599_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3599_squeeze_mask_0 = const()[name = string("op_3599_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_3599_cast_fp16 = slice_by_index(begin = var_3599_begin_0, end = var_3599_end_0, end_mask = var_3599_end_mask_0, squeeze_mask = var_3599_squeeze_mask_0, x = V_full_in)[name = string("op_3599_cast_fp16")];
+            tensor<int32, [1]> V_full_slot_1_axes_0 = const()[name = string("V_full_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_slot_1_cast_fp16 = expand_dims(axes = V_full_slot_1_axes_0, x = var_3599_cast_fp16)[name = string("V_full_slot_1_cast_fp16")];
+            int32 var_3606 = const()[name = string("op_3606"), val = int32(-1)];
+            fp16 const_61_promoted_to_fp16 = const()[name = string("const_61_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3608_cast_fp16 = mul(x = x_99_cast_fp16, y = const_61_promoted_to_fp16)[name = string("op_3608_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_155_cast_fp16 = concat(axis = var_3606, interleave = input_155_interleave_0, values = (x_99_cast_fp16, var_3608_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3603_to_fp16 = const()[name = string("op_3603_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_3603_to_fp16, x = input_155_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_3613_split_sizes_0 = const()[name = string("op_3613_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3613_axis_0 = const()[name = string("op_3613_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3613_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3613_cast_fp16_1 = split(axis = var_3613_axis_0, split_sizes = var_3613_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_3613_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559900928)))];
+            tensor<fp16, [1, 1, 2560]> h_31_cast_fp16 = mul(x = var_3613_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_3619 = const()[name = string("op_3619"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3622_axes_0 = const()[name = string("op_3622_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3620_cast_fp16 = transpose(perm = var_3619, x = h_31_cast_fp16)[name = string("transpose_125")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3622_cast_fp16 = expand_dims(axes = var_3622_axes_0, x = var_3620_cast_fp16)[name = string("op_3622_cast_fp16")];
+            string var_3638_pad_type_0 = const()[name = string("op_3638_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3638_strides_0 = const()[name = string("op_3638_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3638_pad_0 = const()[name = string("op_3638_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3638_dilations_0 = const()[name = string("op_3638_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3638_groups_0 = const()[name = string("op_3638_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_3638 = conv(dilations = var_3638_dilations_0, groups = var_3638_groups_0, pad = var_3638_pad_0, pad_type = var_3638_pad_type_0, strides = var_3638_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_3622_cast_fp16)[name = string("op_3638")];
+            tensor<int32, [4]> var_3643 = const()[name = string("op_3643"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_3644 = reshape(shape = var_3643, x = var_3638)[name = string("op_3644")];
+            tensor<int32, [4]> var_3649 = const()[name = string("op_3649"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3659 = const()[name = string("op_3659"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_3650 = transpose(perm = var_3649, x = var_3644)[name = string("transpose_124")];
+            tensor<fp16, [1, 8, 512]> x_101 = reshape(shape = var_3659, x = var_3650)[name = string("x_101")];
+            int32 var_3665 = const()[name = string("op_3665"), val = int32(-1)];
+            fp16 const_62_promoted = const()[name = string("const_62_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_3667 = mul(x = x_101, y = const_62_promoted)[name = string("op_3667")];
+            bool input_159_interleave_0 = const()[name = string("input_159_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_159 = concat(axis = var_3665, interleave = input_159_interleave_0, values = (x_101, var_3667))[name = string("input_159")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3662_to_fp16 = const()[name = string("op_3662_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_3662_to_fp16, x = input_159)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_3672_split_sizes_0 = const()[name = string("op_3672_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3672_axis_0 = const()[name = string("op_3672_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_3672_0, tensor<fp16, [1, 8, 512]> var_3672_1 = split(axis = var_3672_axis_0, split_sizes = var_3672_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_3672")];
+            tensor<fp16, [1, 8, 512]> var_3674 = mul(x = var_3672_0, y = layers_5_self_attn_q_norm_weight)[name = string("op_3674")];
+            tensor<int32, [4]> var_3679 = const()[name = string("op_3679"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_43 = reshape(shape = var_3679, x = var_3674)[name = string("q_43")];
+            tensor<fp16, [1, 8, 1, 512]> var_3681_cast_fp16 = mul(x = q_43, y = cos_f)[name = string("op_3681_cast_fp16")];
+            tensor<int32, [2]> var_3682_split_sizes_0 = const()[name = string("op_3682_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3682_axis_0 = const()[name = string("op_3682_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_3682_0, tensor<fp16, [1, 8, 1, 256]> var_3682_1 = split(axis = var_3682_axis_0, split_sizes = var_3682_split_sizes_0, x = q_43)[name = string("op_3682")];
+            fp16 const_63_promoted = const()[name = string("const_63_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_3684 = mul(x = var_3682_1, y = const_63_promoted)[name = string("op_3684")];
+            int32 var_3686 = const()[name = string("op_3686"), val = int32(-1)];
+            bool var_3687_interleave_0 = const()[name = string("op_3687_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_3687 = concat(axis = var_3686, interleave = var_3687_interleave_0, values = (var_3684, var_3682_0))[name = string("op_3687")];
+            tensor<fp16, [1, 8, 1, 512]> var_3688_cast_fp16 = mul(x = var_3687, y = sin_f)[name = string("op_3688_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_47_cast_fp16 = add(x = var_3681_cast_fp16, y = var_3688_cast_fp16)[name = string("q_47_cast_fp16")];
+            string var_3701_pad_type_0 = const()[name = string("op_3701_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3701_strides_0 = const()[name = string("op_3701_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3701_pad_0 = const()[name = string("op_3701_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3701_dilations_0 = const()[name = string("op_3701_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3701_groups_0 = const()[name = string("op_3701_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_3701 = conv(dilations = var_3701_dilations_0, groups = var_3701_groups_0, pad = var_3701_pad_0, pad_type = var_3701_pad_type_0, strides = var_3701_strides_0, weight = layers_5_self_attn_k_proj_weight_palettized, x = var_3622_cast_fp16)[name = string("op_3701")];
+            tensor<int32, [4]> var_3706 = const()[name = string("op_3706"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_3707 = reshape(shape = var_3706, x = var_3701)[name = string("op_3707")];
+            tensor<int32, [4]> var_3712 = const()[name = string("op_3712"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_3729_pad_type_0 = const()[name = string("op_3729_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3729_strides_0 = const()[name = string("op_3729_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3729_pad_0 = const()[name = string("op_3729_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3729_dilations_0 = const()[name = string("op_3729_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3729_groups_0 = const()[name = string("op_3729_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_3729 = conv(dilations = var_3729_dilations_0, groups = var_3729_groups_0, pad = var_3729_pad_0, pad_type = var_3729_pad_type_0, strides = var_3729_strides_0, weight = layers_5_self_attn_v_proj_weight_palettized, x = var_3622_cast_fp16)[name = string("op_3729")];
+            tensor<int32, [4]> var_3734 = const()[name = string("op_3734"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_3735 = reshape(shape = var_3734, x = var_3729)[name = string("op_3735")];
+            tensor<int32, [4]> var_3740 = const()[name = string("op_3740"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3750 = const()[name = string("op_3750"), val = tensor<int32, [3]>([1, 2, 512])];
+            tensor<fp16, [1, 2, 1, 512]> var_3713 = transpose(perm = var_3712, x = var_3707)[name = string("transpose_123")];
+            tensor<fp16, [1, 2, 512]> x_103 = reshape(shape = var_3750, x = var_3713)[name = string("x_103")];
+            int32 var_3756 = const()[name = string("op_3756"), val = int32(-1)];
+            fp16 const_64_promoted = const()[name = string("const_64_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 512]> var_3758 = mul(x = x_103, y = const_64_promoted)[name = string("op_3758")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1024]> input_161 = concat(axis = var_3756, interleave = input_161_interleave_0, values = (x_103, var_3758))[name = string("input_161")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3753_to_fp16 = const()[name = string("op_3753_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1024]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_3753_to_fp16, x = input_161)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_3763_split_sizes_0 = const()[name = string("op_3763_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3763_axis_0 = const()[name = string("op_3763_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 512]> var_3763_0, tensor<fp16, [1, 2, 512]> var_3763_1 = split(axis = var_3763_axis_0, split_sizes = var_3763_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_3763")];
+            tensor<fp16, [1, 2, 512]> var_3765 = mul(x = var_3763_0, y = layers_5_self_attn_k_norm_weight)[name = string("op_3765")];
+            tensor<int32, [4]> var_3770 = const()[name = string("op_3770"), val = tensor<int32, [4]>([1, 2, 1, 512])];
+            tensor<fp16, [1, 2, 1, 512]> q_45 = reshape(shape = var_3770, x = var_3765)[name = string("q_45")];
+            fp16 var_3772_promoted = const()[name = string("op_3772_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 512]> var_3741 = transpose(perm = var_3740, x = var_3735)[name = string("transpose_122")];
+            tensor<fp16, [1, 2, 1, 512]> var_3773 = pow(x = var_3741, y = var_3772_promoted)[name = string("op_3773")];
+            tensor<int32, [1]> var_3778_axes_0 = const()[name = string("op_3778_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3778_keep_dims_0 = const()[name = string("op_3778_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_3778 = reduce_mean(axes = var_3778_axes_0, keep_dims = var_3778_keep_dims_0, x = var_3773)[name = string("op_3778")];
+            fp16 var_3780_to_fp16 = const()[name = string("op_3780_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_11_cast_fp16 = add(x = var_3778, y = var_3780_to_fp16)[name = string("mean_sq_11_cast_fp16")];
+            fp32 var_3782_epsilon_0 = const()[name = string("op_3782_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_3782_cast_fp16 = rsqrt(epsilon = var_3782_epsilon_0, x = mean_sq_11_cast_fp16)[name = string("op_3782_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> v_1_cast_fp16 = mul(x = var_3741, y = var_3782_cast_fp16)[name = string("v_1_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> var_3784_cast_fp16 = mul(x = q_45, y = cos_f)[name = string("op_3784_cast_fp16")];
+            tensor<int32, [2]> var_3785_split_sizes_0 = const()[name = string("op_3785_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3785_axis_0 = const()[name = string("op_3785_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 256]> var_3785_0, tensor<fp16, [1, 2, 1, 256]> var_3785_1 = split(axis = var_3785_axis_0, split_sizes = var_3785_split_sizes_0, x = q_45)[name = string("op_3785")];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 256]> var_3787 = mul(x = var_3785_1, y = const_65_promoted)[name = string("op_3787")];
+            int32 var_3789 = const()[name = string("op_3789"), val = int32(-1)];
+            bool var_3790_interleave_0 = const()[name = string("op_3790_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 512]> var_3790 = concat(axis = var_3789, interleave = var_3790_interleave_0, values = (var_3787, var_3785_0))[name = string("op_3790")];
+            tensor<fp16, [1, 2, 1, 512]> var_3791_cast_fp16 = mul(x = var_3790, y = sin_f)[name = string("op_3791_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> k_13_cast_fp16 = add(x = var_3784_cast_fp16, y = var_3791_cast_fp16)[name = string("k_13_cast_fp16")];
+            fp16 var_3794_promoted_to_fp16 = const()[name = string("op_3794_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 2048, 1]> var_3796_cast_fp16 = sub(x = var_3794_promoted_to_fp16, y = update_mask)[name = string("op_3796_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3797_cast_fp16 = mul(x = K_full_slot_1_cast_fp16, y = var_3796_cast_fp16)[name = string("op_3797_cast_fp16")];
+            tensor<int32, [4]> var_3798_reps_0 = const()[name = string("op_3798_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_3798_cast_fp16 = tile(reps = var_3798_reps_0, x = k_13_cast_fp16)[name = string("op_3798_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3799_cast_fp16 = mul(x = var_3798_cast_fp16, y = update_mask)[name = string("op_3799_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_out_1_cast_fp16 = add(x = var_3797_cast_fp16, y = var_3799_cast_fp16)[name = string("K_full_out_1_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3805_cast_fp16 = mul(x = V_full_slot_1_cast_fp16, y = var_3796_cast_fp16)[name = string("op_3805_cast_fp16")];
+            tensor<int32, [4]> var_3806_reps_0 = const()[name = string("op_3806_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_3806_cast_fp16 = tile(reps = var_3806_reps_0, x = v_1_cast_fp16)[name = string("op_3806_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3807_cast_fp16 = mul(x = var_3806_cast_fp16, y = update_mask)[name = string("op_3807_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_out_1_cast_fp16 = add(x = var_3805_cast_fp16, y = var_3807_cast_fp16)[name = string("V_full_out_1_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = K_full_out_1_cast_fp16)[name = string("transpose_121")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_20, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_21 = const()[name = string("concat_21"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_120")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_21, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_53_perm_0 = const()[name = string("transpose_53_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = V_full_out_1_cast_fp16)[name = string("transpose_119")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_22, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_118")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_23, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_53_cast_fp16 = transpose(perm = transpose_53_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_117")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_47_cast_fp16, y = transpose_53_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_107_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_107_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_107_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3849 = sub(x = x_107_cast_fp16, y = reduce_max_5)[name = string("op_3849")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3855 = exp(x = var_3849)[name = string("op_3855")];
+            tensor<int32, [1]> var_3865_axes_0 = const()[name = string("op_3865_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3865_keep_dims_0 = const()[name = string("op_3865_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3865 = reduce_sum(axes = var_3865_axes_0, keep_dims = var_3865_keep_dims_0, x = var_3855)[name = string("op_3865")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3871_cast_fp16 = real_div(x = var_3855, y = var_3865)[name = string("op_3871_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_116")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_3871_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_3882 = const()[name = string("op_3882"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3889 = const()[name = string("op_3889"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_3883_cast_fp16 = transpose(perm = var_3882, x = attn_output_31_cast_fp16)[name = string("transpose_115")];
+            tensor<fp16, [1, 1, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_3889, x = var_3883_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_3894 = const()[name = string("op_3894"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3910_pad_type_0 = const()[name = string("op_3910_pad_type_0"), val = string("valid")];
+            int32 var_3910_groups_0 = const()[name = string("op_3910_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3910_strides_0 = const()[name = string("op_3910_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3910_pad_0 = const()[name = string("op_3910_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3910_dilations_0 = const()[name = string("op_3910_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559906112))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565149056))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_3895_cast_fp16 = transpose(perm = var_3894, x = attn_output_33_cast_fp16)[name = string("transpose_114")];
+            tensor<fp16, [1, 2560, 1]> var_3910_cast_fp16 = conv(dilations = var_3910_dilations_0, groups = var_3910_groups_0, pad = var_3910_pad_0, pad_type = var_3910_pad_type_0, strides = var_3910_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_3895_cast_fp16)[name = string("op_3910_cast_fp16")];
+            tensor<int32, [3]> var_3914 = const()[name = string("op_3914"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3920 = const()[name = string("op_3920"), val = int32(-1)];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_111_cast_fp16 = transpose(perm = var_3914, x = var_3910_cast_fp16)[name = string("transpose_113")];
+            tensor<fp16, [1, 1, 2560]> var_3922_cast_fp16 = mul(x = x_111_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_3922_cast_fp16")];
+            bool input_165_interleave_0 = const()[name = string("input_165_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_165_cast_fp16 = concat(axis = var_3920, interleave = input_165_interleave_0, values = (x_111_cast_fp16, var_3922_cast_fp16))[name = string("input_165_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3917_to_fp16 = const()[name = string("op_3917_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_3917_to_fp16, x = input_165_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_3927_split_sizes_0 = const()[name = string("op_3927_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3927_axis_0 = const()[name = string("op_3927_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3927_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3927_cast_fp16_1 = split(axis = var_3927_axis_0, split_sizes = var_3927_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_3927_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565151680)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_35_cast_fp16 = mul(x = var_3927_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_113_cast_fp16 = add(x = x_99_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_113_cast_fp16")];
+            int32 var_3936 = const()[name = string("op_3936"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3938_cast_fp16 = mul(x = x_113_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_3938_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_167_cast_fp16 = concat(axis = var_3936, interleave = input_167_interleave_0, values = (x_113_cast_fp16, var_3938_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3933_to_fp16 = const()[name = string("op_3933_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_3933_to_fp16, x = input_167_cast_fp16)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_3943_split_sizes_0 = const()[name = string("op_3943_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3943_axis_0 = const()[name = string("op_3943_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3943_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3943_cast_fp16_1 = split(axis = var_3943_axis_0, split_sizes = var_3943_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_3943_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565156864)))];
+            tensor<fp16, [1, 1, 2560]> h_33_cast_fp16 = mul(x = var_3943_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_3954 = const()[name = string("op_3954"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_169_axes_0 = const()[name = string("input_169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3955 = transpose(perm = var_3954, x = h_33_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [1, 2560, 1, 1]> input_169 = expand_dims(axes = input_169_axes_0, x = var_3955)[name = string("input_169")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_169)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_169)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 1]> input_171 = mul(x = gate_23, y = up_11)[name = string("input_171")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_171)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_3995_axes_0 = const()[name = string("op_3995_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3995 = squeeze(axes = var_3995_axes_0, x = mlp_out_11)[name = string("op_3995")];
+            tensor<int32, [3]> var_3999 = const()[name = string("op_3999"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4005 = const()[name = string("op_4005"), val = int32(-1)];
+            fp16 const_68_promoted = const()[name = string("const_68_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_115 = transpose(perm = var_3999, x = var_3995)[name = string("transpose_111")];
+            tensor<fp16, [1, 1, 2560]> var_4007 = mul(x = x_115, y = const_68_promoted)[name = string("op_4007")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_173 = concat(axis = var_4005, interleave = input_173_interleave_0, values = (x_115, var_4007))[name = string("input_173")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4002_to_fp16 = const()[name = string("op_4002_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_4002_to_fp16, x = input_173)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_4012_split_sizes_0 = const()[name = string("op_4012_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4012_axis_0 = const()[name = string("op_4012_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4012_0, tensor<fp16, [1, 1, 2560]> var_4012_1 = split(axis = var_4012_axis_0, split_sizes = var_4012_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_4012")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_53 = mul(x = var_4012_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_55_cast_fp16 = add(x = x_113_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 1280])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 1, 1536])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_4040 = const()[name = string("op_4040"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_175_axes_0 = const()[name = string("input_175_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4041 = transpose(perm = var_4040, x = hidden_states_55_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [1, 2560, 1, 1]> input_175 = expand_dims(axes = input_175_axes_0, x = var_4041)[name = string("input_175")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_175)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_4060 = const()[name = string("op_4060"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_4061_cast_fp16 = transpose(perm = var_4060, x = per_layer_slice_11_cast_fp16)[name = string("transpose_109")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_4061_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_177_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_177_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565162048))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565489792))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_177_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_4077_axes_0 = const()[name = string("op_4077_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4077_cast_fp16 = squeeze(axes = var_4077_axes_0, x = gated_35_cast_fp16)[name = string("op_4077_cast_fp16")];
+            tensor<int32, [3]> var_4081 = const()[name = string("op_4081"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4087 = const()[name = string("op_4087"), val = int32(-1)];
+            fp16 const_69_promoted_to_fp16 = const()[name = string("const_69_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_117_cast_fp16 = transpose(perm = var_4081, x = var_4077_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [1, 1, 2560]> var_4089_cast_fp16 = mul(x = x_117_cast_fp16, y = const_69_promoted_to_fp16)[name = string("op_4089_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_179_cast_fp16 = concat(axis = var_4087, interleave = input_179_interleave_0, values = (x_117_cast_fp16, var_4089_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4084_to_fp16 = const()[name = string("op_4084_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_4084_to_fp16, x = input_179_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_4094_split_sizes_0 = const()[name = string("op_4094_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4094_axis_0 = const()[name = string("op_4094_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4094_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4094_cast_fp16_1 = split(axis = var_4094_axis_0, split_sizes = var_4094_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_4094_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565492416)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_59_cast_fp16 = mul(x = var_4094_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.36p-1])];
+            tensor<fp16, [1, 1, 2560]> x_119_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_70_promoted_to_fp16)[name = string("x_119_cast_fp16")];
+            tensor<int32, [1]> var_4106_axes_0 = const()[name = string("op_4106_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_4106_cast_fp16 = squeeze(axes = var_4106_axes_0, x = K_full_out_1_cast_fp16)[name = string("op_4106_cast_fp16")];
+            tensor<int32, [1]> var_4108_axes_0 = const()[name = string("op_4108_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_4108_cast_fp16 = squeeze(axes = var_4108_axes_0, x = V_full_out_1_cast_fp16)[name = string("op_4108_cast_fp16")];
+            tensor<int32, [4]> var_4111_begin_0 = const()[name = string("op_4111_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4111_end_0 = const()[name = string("op_4111_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4111_end_mask_0 = const()[name = string("op_4111_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4111_squeeze_mask_0 = const()[name = string("op_4111_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4111_cast_fp16 = slice_by_index(begin = var_4111_begin_0, end = var_4111_end_0, end_mask = var_4111_end_mask_0, squeeze_mask = var_4111_squeeze_mask_0, x = K_sliding_in)[name = string("op_4111_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_11_axes_0 = const()[name = string("K_sliding_slot_11_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_11_cast_fp16 = expand_dims(axes = K_sliding_slot_11_axes_0, x = var_4111_cast_fp16)[name = string("K_sliding_slot_11_cast_fp16")];
+            tensor<int32, [4]> var_4116_begin_0 = const()[name = string("op_4116_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4116_end_0 = const()[name = string("op_4116_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4116_end_mask_0 = const()[name = string("op_4116_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4116_squeeze_mask_0 = const()[name = string("op_4116_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4116_cast_fp16 = slice_by_index(begin = var_4116_begin_0, end = var_4116_end_0, end_mask = var_4116_end_mask_0, squeeze_mask = var_4116_squeeze_mask_0, x = V_sliding_in)[name = string("op_4116_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_11_axes_0 = const()[name = string("V_sliding_slot_11_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_11_cast_fp16 = expand_dims(axes = V_sliding_slot_11_axes_0, x = var_4116_cast_fp16)[name = string("V_sliding_slot_11_cast_fp16")];
+            int32 var_4123 = const()[name = string("op_4123"), val = int32(-1)];
+            fp16 const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4125_cast_fp16 = mul(x = x_119_cast_fp16, y = const_71_promoted_to_fp16)[name = string("op_4125_cast_fp16")];
+            bool input_181_interleave_0 = const()[name = string("input_181_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_181_cast_fp16 = concat(axis = var_4123, interleave = input_181_interleave_0, values = (x_119_cast_fp16, var_4125_cast_fp16))[name = string("input_181_cast_fp16")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4120_to_fp16 = const()[name = string("op_4120_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_4120_to_fp16, x = input_181_cast_fp16)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_4130_split_sizes_0 = const()[name = string("op_4130_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4130_axis_0 = const()[name = string("op_4130_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4130_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4130_cast_fp16_1 = split(axis = var_4130_axis_0, split_sizes = var_4130_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_4130_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565497600)))];
+            tensor<fp16, [1, 1, 2560]> h_37_cast_fp16 = mul(x = var_4130_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_4136 = const()[name = string("op_4136"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4139_axes_0 = const()[name = string("op_4139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4137_cast_fp16 = transpose(perm = var_4136, x = h_37_cast_fp16)[name = string("transpose_107")];
+            tensor<fp16, [1, 2560, 1, 1]> var_4139_cast_fp16 = expand_dims(axes = var_4139_axes_0, x = var_4137_cast_fp16)[name = string("op_4139_cast_fp16")];
+            string var_4155_pad_type_0 = const()[name = string("op_4155_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4155_strides_0 = const()[name = string("op_4155_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4155_pad_0 = const()[name = string("op_4155_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4155_dilations_0 = const()[name = string("op_4155_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4155_groups_0 = const()[name = string("op_4155_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_4155 = conv(dilations = var_4155_dilations_0, groups = var_4155_groups_0, pad = var_4155_pad_0, pad_type = var_4155_pad_type_0, strides = var_4155_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_4139_cast_fp16)[name = string("op_4155")];
+            tensor<int32, [4]> var_4160 = const()[name = string("op_4160"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_4161 = reshape(shape = var_4160, x = var_4155)[name = string("op_4161")];
+            tensor<int32, [4]> var_4166 = const()[name = string("op_4166"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4176 = const()[name = string("op_4176"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_4167 = transpose(perm = var_4166, x = var_4161)[name = string("transpose_106")];
+            tensor<fp16, [1, 8, 256]> x_121 = reshape(shape = var_4176, x = var_4167)[name = string("x_121")];
+            int32 var_4182 = const()[name = string("op_4182"), val = int32(-1)];
+            fp16 const_72_promoted = const()[name = string("const_72_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_4184 = mul(x = x_121, y = const_72_promoted)[name = string("op_4184")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_185 = concat(axis = var_4182, interleave = input_185_interleave_0, values = (x_121, var_4184))[name = string("input_185")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4179_to_fp16 = const()[name = string("op_4179_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_4179_to_fp16, x = input_185)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_4189_split_sizes_0 = const()[name = string("op_4189_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4189_axis_0 = const()[name = string("op_4189_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_4189_0, tensor<fp16, [1, 8, 256]> var_4189_1 = split(axis = var_4189_axis_0, split_sizes = var_4189_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_4189")];
+            tensor<fp16, [1, 8, 256]> var_4191 = mul(x = var_4189_0, y = layers_3_self_attn_q_norm_weight)[name = string("op_4191")];
+            tensor<int32, [4]> var_4196 = const()[name = string("op_4196"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_51 = reshape(shape = var_4196, x = var_4191)[name = string("q_51")];
+            tensor<fp16, [1, 8, 1, 256]> var_4198_cast_fp16 = mul(x = q_51, y = cos_s)[name = string("op_4198_cast_fp16")];
+            tensor<int32, [2]> var_4199_split_sizes_0 = const()[name = string("op_4199_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4199_axis_0 = const()[name = string("op_4199_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_4199_0, tensor<fp16, [1, 8, 1, 128]> var_4199_1 = split(axis = var_4199_axis_0, split_sizes = var_4199_split_sizes_0, x = q_51)[name = string("op_4199")];
+            fp16 const_73_promoted = const()[name = string("const_73_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_4201 = mul(x = var_4199_1, y = const_73_promoted)[name = string("op_4201")];
+            int32 var_4203 = const()[name = string("op_4203"), val = int32(-1)];
+            bool var_4204_interleave_0 = const()[name = string("op_4204_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_4204 = concat(axis = var_4203, interleave = var_4204_interleave_0, values = (var_4201, var_4199_0))[name = string("op_4204")];
+            tensor<fp16, [1, 8, 1, 256]> var_4205_cast_fp16 = mul(x = var_4204, y = sin_s)[name = string("op_4205_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_55_cast_fp16 = add(x = var_4198_cast_fp16, y = var_4205_cast_fp16)[name = string("q_55_cast_fp16")];
+            string var_4218_pad_type_0 = const()[name = string("op_4218_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4218_strides_0 = const()[name = string("op_4218_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4218_pad_0 = const()[name = string("op_4218_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4218_dilations_0 = const()[name = string("op_4218_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4218_groups_0 = const()[name = string("op_4218_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4218 = conv(dilations = var_4218_dilations_0, groups = var_4218_groups_0, pad = var_4218_pad_0, pad_type = var_4218_pad_type_0, strides = var_4218_strides_0, weight = layers_6_self_attn_k_proj_weight_palettized, x = var_4139_cast_fp16)[name = string("op_4218")];
+            tensor<int32, [4]> var_4223 = const()[name = string("op_4223"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4224 = reshape(shape = var_4223, x = var_4218)[name = string("op_4224")];
+            tensor<int32, [4]> var_4229 = const()[name = string("op_4229"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_4246_pad_type_0 = const()[name = string("op_4246_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4246_strides_0 = const()[name = string("op_4246_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4246_pad_0 = const()[name = string("op_4246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4246_dilations_0 = const()[name = string("op_4246_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4246_groups_0 = const()[name = string("op_4246_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4246 = conv(dilations = var_4246_dilations_0, groups = var_4246_groups_0, pad = var_4246_pad_0, pad_type = var_4246_pad_type_0, strides = var_4246_strides_0, weight = layers_6_self_attn_v_proj_weight_palettized, x = var_4139_cast_fp16)[name = string("op_4246")];
+            tensor<int32, [4]> var_4251 = const()[name = string("op_4251"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4252 = reshape(shape = var_4251, x = var_4246)[name = string("op_4252")];
+            tensor<int32, [4]> var_4257 = const()[name = string("op_4257"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4267 = const()[name = string("op_4267"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_4230 = transpose(perm = var_4229, x = var_4224)[name = string("transpose_105")];
+            tensor<fp16, [1, 2, 256]> x_123 = reshape(shape = var_4267, x = var_4230)[name = string("x_123")];
+            int32 var_4273 = const()[name = string("op_4273"), val = int32(-1)];
+            fp16 const_74_promoted = const()[name = string("const_74_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_4275 = mul(x = x_123, y = const_74_promoted)[name = string("op_4275")];
+            bool input_187_interleave_0 = const()[name = string("input_187_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_187 = concat(axis = var_4273, interleave = input_187_interleave_0, values = (x_123, var_4275))[name = string("input_187")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4270_to_fp16 = const()[name = string("op_4270_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_4270_to_fp16, x = input_187)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_4280_split_sizes_0 = const()[name = string("op_4280_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4280_axis_0 = const()[name = string("op_4280_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_4280_0, tensor<fp16, [1, 2, 256]> var_4280_1 = split(axis = var_4280_axis_0, split_sizes = var_4280_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_4280")];
+            tensor<fp16, [1, 2, 256]> var_4282 = mul(x = var_4280_0, y = layers_6_self_attn_k_norm_weight)[name = string("op_4282")];
+            tensor<int32, [4]> var_4287 = const()[name = string("op_4287"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_53 = reshape(shape = var_4287, x = var_4282)[name = string("q_53")];
+            fp16 var_4289_promoted = const()[name = string("op_4289_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_4258 = transpose(perm = var_4257, x = var_4252)[name = string("transpose_104")];
+            tensor<fp16, [1, 2, 1, 256]> var_4290 = pow(x = var_4258, y = var_4289_promoted)[name = string("op_4290")];
+            tensor<int32, [1]> var_4295_axes_0 = const()[name = string("op_4295_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4295_keep_dims_0 = const()[name = string("op_4295_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_4295 = reduce_mean(axes = var_4295_axes_0, keep_dims = var_4295_keep_dims_0, x = var_4290)[name = string("op_4295")];
+            fp16 var_4297_to_fp16 = const()[name = string("op_4297_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_13_cast_fp16 = add(x = var_4295, y = var_4297_to_fp16)[name = string("mean_sq_13_cast_fp16")];
+            fp32 var_4299_epsilon_0 = const()[name = string("op_4299_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_4299_cast_fp16 = rsqrt(epsilon = var_4299_epsilon_0, x = mean_sq_13_cast_fp16)[name = string("op_4299_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_191_cast_fp16 = mul(x = var_4258, y = var_4299_cast_fp16)[name = string("input_191_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_4301_cast_fp16 = mul(x = q_53, y = cos_s)[name = string("op_4301_cast_fp16")];
+            tensor<int32, [2]> var_4302_split_sizes_0 = const()[name = string("op_4302_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4302_axis_0 = const()[name = string("op_4302_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_4302_0, tensor<fp16, [1, 2, 1, 128]> var_4302_1 = split(axis = var_4302_axis_0, split_sizes = var_4302_split_sizes_0, x = q_53)[name = string("op_4302")];
+            fp16 const_75_promoted = const()[name = string("const_75_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_4304 = mul(x = var_4302_1, y = const_75_promoted)[name = string("op_4304")];
+            int32 var_4306 = const()[name = string("op_4306"), val = int32(-1)];
+            bool var_4307_interleave_0 = const()[name = string("op_4307_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_4307 = concat(axis = var_4306, interleave = var_4307_interleave_0, values = (var_4304, var_4302_0))[name = string("op_4307")];
+            tensor<fp16, [1, 2, 1, 256]> var_4308_cast_fp16 = mul(x = var_4307, y = sin_s)[name = string("op_4308_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_189_cast_fp16 = add(x = var_4301_cast_fp16, y = var_4308_cast_fp16)[name = string("input_189_cast_fp16")];
+            tensor<int32, [8]> k_padded_11_pad_0 = const()[name = string("k_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_11_mode_0 = const()[name = string("k_padded_11_mode_0"), val = string("constant")];
+            fp16 const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_11_cast_fp16 = pad(constant_val = const_76_to_fp16, mode = k_padded_11_mode_0, pad = k_padded_11_pad_0, x = input_189_cast_fp16)[name = string("k_padded_11_cast_fp16")];
+            tensor<int32, [8]> v_padded_11_pad_0 = const()[name = string("v_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_11_mode_0 = const()[name = string("v_padded_11_mode_0"), val = string("constant")];
+            fp16 const_77_to_fp16 = const()[name = string("const_77_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_11_cast_fp16 = pad(constant_val = const_77_to_fp16, mode = v_padded_11_mode_0, pad = v_padded_11_pad_0, x = input_191_cast_fp16)[name = string("v_padded_11_cast_fp16")];
+            tensor<int32, [4]> var_4337_begin_0 = const()[name = string("op_4337_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4337_end_0 = const()[name = string("op_4337_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4337_end_mask_0 = const()[name = string("op_4337_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4337_cast_fp16 = slice_by_index(begin = var_4337_begin_0, end = var_4337_end_0, end_mask = var_4337_end_mask_0, x = K_sliding_slot_11_cast_fp16)[name = string("op_4337_cast_fp16")];
+            int32 var_4344 = const()[name = string("op_4344"), val = int32(2)];
+            bool K_sliding_out_11_interleave_0 = const()[name = string("K_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_11_cast_fp16 = concat(axis = var_4344, interleave = K_sliding_out_11_interleave_0, values = (var_4337_cast_fp16, k_padded_11_cast_fp16))[name = string("K_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4360_begin_0 = const()[name = string("op_4360_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4360_end_0 = const()[name = string("op_4360_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4360_end_mask_0 = const()[name = string("op_4360_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4360_cast_fp16 = slice_by_index(begin = var_4360_begin_0, end = var_4360_end_0, end_mask = var_4360_end_mask_0, x = V_sliding_slot_11_cast_fp16)[name = string("op_4360_cast_fp16")];
+            int32 var_4367 = const()[name = string("op_4367"), val = int32(2)];
+            bool V_sliding_out_11_interleave_0 = const()[name = string("V_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_11_cast_fp16 = concat(axis = var_4367, interleave = V_sliding_out_11_interleave_0, values = (var_4360_cast_fp16, v_padded_11_cast_fp16))[name = string("V_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_13_begin_0 = const()[name = string("K_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_13_end_0 = const()[name = string("K_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_13_end_mask_0 = const()[name = string("K_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_13_cast_fp16 = slice_by_index(begin = K_for_attn_13_begin_0, end = K_for_attn_13_end_0, end_mask = K_for_attn_13_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("K_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_13_begin_0 = const()[name = string("V_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_13_end_0 = const()[name = string("V_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_13_end_mask_0 = const()[name = string("V_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_13_cast_fp16 = slice_by_index(begin = V_for_attn_13_begin_0, end = V_for_attn_13_end_0, end_mask = V_for_attn_13_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("V_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> transpose_24_perm_0 = const()[name = string("transpose_24_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_12_reps_0 = const()[name = string("tile_12_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_24_cast_fp16 = transpose(perm = transpose_24_perm_0, x = K_for_attn_13_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [8, 1, 512, 256]> tile_12_cast_fp16 = tile(reps = tile_12_reps_0, x = transpose_24_cast_fp16)[name = string("tile_12_cast_fp16")];
+            tensor<int32, [5]> concat_24 = const()[name = string("concat_24"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_24_cast_fp16 = reshape(shape = concat_24, x = tile_12_cast_fp16)[name = string("reshape_24_cast_fp16")];
+            tensor<int32, [5]> transpose_25_perm_0 = const()[name = string("transpose_25_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_25 = const()[name = string("concat_25"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_25_cast_fp16 = transpose(perm = transpose_25_perm_0, x = reshape_24_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_25_cast_fp16 = reshape(shape = concat_25, x = transpose_25_cast_fp16)[name = string("reshape_25_cast_fp16")];
+            tensor<int32, [4]> transpose_54_perm_0 = const()[name = string("transpose_54_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_26_perm_0 = const()[name = string("transpose_26_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_13_reps_0 = const()[name = string("tile_13_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_26_cast_fp16 = transpose(perm = transpose_26_perm_0, x = V_for_attn_13_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [8, 1, 512, 256]> tile_13_cast_fp16 = tile(reps = tile_13_reps_0, x = transpose_26_cast_fp16)[name = string("tile_13_cast_fp16")];
+            tensor<int32, [5]> concat_26 = const()[name = string("concat_26"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_26_cast_fp16 = reshape(shape = concat_26, x = tile_13_cast_fp16)[name = string("reshape_26_cast_fp16")];
+            tensor<int32, [5]> transpose_27_perm_0 = const()[name = string("transpose_27_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_27 = const()[name = string("concat_27"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_27_cast_fp16 = transpose(perm = transpose_27_perm_0, x = reshape_26_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_27_cast_fp16 = reshape(shape = concat_27, x = transpose_27_cast_fp16)[name = string("reshape_27_cast_fp16")];
+            tensor<int32, [4]> V_expanded_13_perm_0 = const()[name = string("V_expanded_13_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_54_cast_fp16 = transpose(perm = transpose_54_perm_0, x = reshape_25_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_55_cast_fp16, y = transpose_54_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_127_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_127_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_127_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 1, 512]> var_4408 = sub(x = x_127_cast_fp16, y = reduce_max_6)[name = string("op_4408")];
+            tensor<fp16, [1, 8, 1, 512]> var_4414 = exp(x = var_4408)[name = string("op_4414")];
+            tensor<int32, [1]> var_4424_axes_0 = const()[name = string("op_4424_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4424_keep_dims_0 = const()[name = string("op_4424_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_4424 = reduce_sum(axes = var_4424_axes_0, keep_dims = var_4424_keep_dims_0, x = var_4414)[name = string("op_4424")];
+            tensor<fp16, [1, 8, 1, 512]> var_4430_cast_fp16 = real_div(x = var_4414, y = var_4424)[name = string("op_4430_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_13_cast_fp16 = transpose(perm = V_expanded_13_perm_0, x = reshape_27_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_4430_cast_fp16, y = V_expanded_13_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_4441 = const()[name = string("op_4441"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4448 = const()[name = string("op_4448"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_4442_cast_fp16 = transpose(perm = var_4441, x = attn_output_37_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 1, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_4448, x = var_4442_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_4453 = const()[name = string("op_4453"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4469_pad_type_0 = const()[name = string("op_4469_pad_type_0"), val = string("valid")];
+            int32 var_4469_groups_0 = const()[name = string("op_4469_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4469_strides_0 = const()[name = string("op_4469_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4469_pad_0 = const()[name = string("op_4469_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4469_dilations_0 = const()[name = string("op_4469_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565502784))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568124288))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_4454_cast_fp16 = transpose(perm = var_4453, x = attn_output_39_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 2560, 1]> var_4469_cast_fp16 = conv(dilations = var_4469_dilations_0, groups = var_4469_groups_0, pad = var_4469_pad_0, pad_type = var_4469_pad_type_0, strides = var_4469_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_4454_cast_fp16)[name = string("op_4469_cast_fp16")];
+            tensor<int32, [3]> var_4473 = const()[name = string("op_4473"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4479 = const()[name = string("op_4479"), val = int32(-1)];
+            fp16 const_78_promoted_to_fp16 = const()[name = string("const_78_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_131_cast_fp16 = transpose(perm = var_4473, x = var_4469_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 1, 2560]> var_4481_cast_fp16 = mul(x = x_131_cast_fp16, y = const_78_promoted_to_fp16)[name = string("op_4481_cast_fp16")];
+            bool input_195_interleave_0 = const()[name = string("input_195_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_195_cast_fp16 = concat(axis = var_4479, interleave = input_195_interleave_0, values = (x_131_cast_fp16, var_4481_cast_fp16))[name = string("input_195_cast_fp16")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4476_to_fp16 = const()[name = string("op_4476_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_4476_to_fp16, x = input_195_cast_fp16)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_4486_split_sizes_0 = const()[name = string("op_4486_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4486_axis_0 = const()[name = string("op_4486_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4486_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4486_cast_fp16_1 = split(axis = var_4486_axis_0, split_sizes = var_4486_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_4486_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568126912)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_41_cast_fp16 = mul(x = var_4486_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_133_cast_fp16 = add(x = x_119_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_133_cast_fp16")];
+            int32 var_4495 = const()[name = string("op_4495"), val = int32(-1)];
+            fp16 const_79_promoted_to_fp16 = const()[name = string("const_79_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4497_cast_fp16 = mul(x = x_133_cast_fp16, y = const_79_promoted_to_fp16)[name = string("op_4497_cast_fp16")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_197_cast_fp16 = concat(axis = var_4495, interleave = input_197_interleave_0, values = (x_133_cast_fp16, var_4497_cast_fp16))[name = string("input_197_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4492_to_fp16 = const()[name = string("op_4492_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_4492_to_fp16, x = input_197_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_4502_split_sizes_0 = const()[name = string("op_4502_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4502_axis_0 = const()[name = string("op_4502_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4502_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4502_cast_fp16_1 = split(axis = var_4502_axis_0, split_sizes = var_4502_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_4502_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568132096)))];
+            tensor<fp16, [1, 1, 2560]> h_39_cast_fp16 = mul(x = var_4502_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_4513 = const()[name = string("op_4513"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_199_axes_0 = const()[name = string("input_199_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4514 = transpose(perm = var_4513, x = h_39_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 2560, 1, 1]> input_199 = expand_dims(axes = input_199_axes_0, x = var_4514)[name = string("input_199")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_199)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_199)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 1]> input_201 = mul(x = gate_27, y = up_13)[name = string("input_201")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_201)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_4554_axes_0 = const()[name = string("op_4554_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4554 = squeeze(axes = var_4554_axes_0, x = mlp_out_13)[name = string("op_4554")];
+            tensor<int32, [3]> var_4558 = const()[name = string("op_4558"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4564 = const()[name = string("op_4564"), val = int32(-1)];
+            fp16 const_80_promoted = const()[name = string("const_80_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_135 = transpose(perm = var_4558, x = var_4554)[name = string("transpose_93")];
+            tensor<fp16, [1, 1, 2560]> var_4566 = mul(x = x_135, y = const_80_promoted)[name = string("op_4566")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_203 = concat(axis = var_4564, interleave = input_203_interleave_0, values = (x_135, var_4566))[name = string("input_203")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4561_to_fp16 = const()[name = string("op_4561_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_4561_to_fp16, x = input_203)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_4571_split_sizes_0 = const()[name = string("op_4571_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4571_axis_0 = const()[name = string("op_4571_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4571_0, tensor<fp16, [1, 1, 2560]> var_4571_1 = split(axis = var_4571_axis_0, split_sizes = var_4571_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_4571")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_63 = mul(x = var_4571_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_65_cast_fp16 = add(x = x_133_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 1536])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 1, 1792])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_4599 = const()[name = string("op_4599"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4600 = transpose(perm = var_4599, x = hidden_states_65_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 2560, 1, 1]> input_205 = expand_dims(axes = input_205_axes_0, x = var_4600)[name = string("input_205")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_205)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_4619 = const()[name = string("op_4619"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_4620_cast_fp16 = transpose(perm = var_4619, x = per_layer_slice_13_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_4620_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_207_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_207_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568137280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568465024))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_207_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_4636_axes_0 = const()[name = string("op_4636_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4636_cast_fp16 = squeeze(axes = var_4636_axes_0, x = gated_41_cast_fp16)[name = string("op_4636_cast_fp16")];
+            tensor<int32, [3]> var_4640 = const()[name = string("op_4640"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4646 = const()[name = string("op_4646"), val = int32(-1)];
+            fp16 const_81_promoted_to_fp16 = const()[name = string("const_81_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_137_cast_fp16 = transpose(perm = var_4640, x = var_4636_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 1, 2560]> var_4648_cast_fp16 = mul(x = x_137_cast_fp16, y = const_81_promoted_to_fp16)[name = string("op_4648_cast_fp16")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_209_cast_fp16 = concat(axis = var_4646, interleave = input_209_interleave_0, values = (x_137_cast_fp16, var_4648_cast_fp16))[name = string("input_209_cast_fp16")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4643_to_fp16 = const()[name = string("op_4643_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_4643_to_fp16, x = input_209_cast_fp16)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_4653_split_sizes_0 = const()[name = string("op_4653_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4653_axis_0 = const()[name = string("op_4653_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4653_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4653_cast_fp16_1 = split(axis = var_4653_axis_0, split_sizes = var_4653_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_4653_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568467648)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_69_cast_fp16 = mul(x = var_4653_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_82_promoted_to_fp16 = const()[name = string("const_82_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.1ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_139_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_82_promoted_to_fp16)[name = string("x_139_cast_fp16")];
+            tensor<int32, [1]> var_4665_axes_0 = const()[name = string("op_4665_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_4665_cast_fp16 = squeeze(axes = var_4665_axes_0, x = K_sliding_out_11_cast_fp16)[name = string("op_4665_cast_fp16")];
+            tensor<int32, [1]> var_4667_axes_0 = const()[name = string("op_4667_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_4667_cast_fp16 = squeeze(axes = var_4667_axes_0, x = V_sliding_out_11_cast_fp16)[name = string("op_4667_cast_fp16")];
+            tensor<int32, [4]> var_4670_begin_0 = const()[name = string("op_4670_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4670_end_0 = const()[name = string("op_4670_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_4670_end_mask_0 = const()[name = string("op_4670_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4670_squeeze_mask_0 = const()[name = string("op_4670_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4670_cast_fp16 = slice_by_index(begin = var_4670_begin_0, end = var_4670_end_0, end_mask = var_4670_end_mask_0, squeeze_mask = var_4670_squeeze_mask_0, x = K_sliding_in)[name = string("op_4670_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_13_axes_0 = const()[name = string("K_sliding_slot_13_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_13_cast_fp16 = expand_dims(axes = K_sliding_slot_13_axes_0, x = var_4670_cast_fp16)[name = string("K_sliding_slot_13_cast_fp16")];
+            tensor<int32, [4]> var_4675_begin_0 = const()[name = string("op_4675_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4675_end_0 = const()[name = string("op_4675_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_4675_end_mask_0 = const()[name = string("op_4675_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4675_squeeze_mask_0 = const()[name = string("op_4675_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4675_cast_fp16 = slice_by_index(begin = var_4675_begin_0, end = var_4675_end_0, end_mask = var_4675_end_mask_0, squeeze_mask = var_4675_squeeze_mask_0, x = V_sliding_in)[name = string("op_4675_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_13_axes_0 = const()[name = string("V_sliding_slot_13_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_13_cast_fp16 = expand_dims(axes = V_sliding_slot_13_axes_0, x = var_4675_cast_fp16)[name = string("V_sliding_slot_13_cast_fp16")];
+            int32 var_4682 = const()[name = string("op_4682"), val = int32(-1)];
+            fp16 const_83_promoted_to_fp16 = const()[name = string("const_83_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4684_cast_fp16 = mul(x = x_139_cast_fp16, y = const_83_promoted_to_fp16)[name = string("op_4684_cast_fp16")];
+            bool input_211_interleave_0 = const()[name = string("input_211_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_211_cast_fp16 = concat(axis = var_4682, interleave = input_211_interleave_0, values = (x_139_cast_fp16, var_4684_cast_fp16))[name = string("input_211_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4679_to_fp16 = const()[name = string("op_4679_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_4679_to_fp16, x = input_211_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_4689_split_sizes_0 = const()[name = string("op_4689_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4689_axis_0 = const()[name = string("op_4689_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4689_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4689_cast_fp16_1 = split(axis = var_4689_axis_0, split_sizes = var_4689_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_4689_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568472832)))];
+            tensor<fp16, [1, 1, 2560]> h_43_cast_fp16 = mul(x = var_4689_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_4695 = const()[name = string("op_4695"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4698_axes_0 = const()[name = string("op_4698_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4696_cast_fp16 = transpose(perm = var_4695, x = h_43_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 2560, 1, 1]> var_4698_cast_fp16 = expand_dims(axes = var_4698_axes_0, x = var_4696_cast_fp16)[name = string("op_4698_cast_fp16")];
+            string var_4714_pad_type_0 = const()[name = string("op_4714_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4714_strides_0 = const()[name = string("op_4714_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4714_pad_0 = const()[name = string("op_4714_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4714_dilations_0 = const()[name = string("op_4714_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4714_groups_0 = const()[name = string("op_4714_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_4714 = conv(dilations = var_4714_dilations_0, groups = var_4714_groups_0, pad = var_4714_pad_0, pad_type = var_4714_pad_type_0, strides = var_4714_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_4698_cast_fp16)[name = string("op_4714")];
+            tensor<int32, [4]> var_4719 = const()[name = string("op_4719"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_4720 = reshape(shape = var_4719, x = var_4714)[name = string("op_4720")];
+            tensor<int32, [4]> var_4725 = const()[name = string("op_4725"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4735 = const()[name = string("op_4735"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_4726 = transpose(perm = var_4725, x = var_4720)[name = string("transpose_88")];
+            tensor<fp16, [1, 8, 256]> x_141 = reshape(shape = var_4735, x = var_4726)[name = string("x_141")];
+            int32 var_4741 = const()[name = string("op_4741"), val = int32(-1)];
+            fp16 const_84_promoted = const()[name = string("const_84_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_4743 = mul(x = x_141, y = const_84_promoted)[name = string("op_4743")];
+            bool input_215_interleave_0 = const()[name = string("input_215_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_215 = concat(axis = var_4741, interleave = input_215_interleave_0, values = (x_141, var_4743))[name = string("input_215")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4738_to_fp16 = const()[name = string("op_4738_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_4738_to_fp16, x = input_215)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_4748_split_sizes_0 = const()[name = string("op_4748_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4748_axis_0 = const()[name = string("op_4748_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_4748_0, tensor<fp16, [1, 8, 256]> var_4748_1 = split(axis = var_4748_axis_0, split_sizes = var_4748_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_4748")];
+            tensor<fp16, [1, 8, 256]> var_4750 = mul(x = var_4748_0, y = layers_7_self_attn_q_norm_weight)[name = string("op_4750")];
+            tensor<int32, [4]> var_4755 = const()[name = string("op_4755"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_59 = reshape(shape = var_4755, x = var_4750)[name = string("q_59")];
+            tensor<fp16, [1, 8, 1, 256]> var_4757_cast_fp16 = mul(x = q_59, y = cos_s)[name = string("op_4757_cast_fp16")];
+            tensor<int32, [2]> var_4758_split_sizes_0 = const()[name = string("op_4758_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4758_axis_0 = const()[name = string("op_4758_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_4758_0, tensor<fp16, [1, 8, 1, 128]> var_4758_1 = split(axis = var_4758_axis_0, split_sizes = var_4758_split_sizes_0, x = q_59)[name = string("op_4758")];
+            fp16 const_85_promoted = const()[name = string("const_85_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_4760 = mul(x = var_4758_1, y = const_85_promoted)[name = string("op_4760")];
+            int32 var_4762 = const()[name = string("op_4762"), val = int32(-1)];
+            bool var_4763_interleave_0 = const()[name = string("op_4763_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_4763 = concat(axis = var_4762, interleave = var_4763_interleave_0, values = (var_4760, var_4758_0))[name = string("op_4763")];
+            tensor<fp16, [1, 8, 1, 256]> var_4764_cast_fp16 = mul(x = var_4763, y = sin_s)[name = string("op_4764_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_63_cast_fp16 = add(x = var_4757_cast_fp16, y = var_4764_cast_fp16)[name = string("q_63_cast_fp16")];
+            string var_4777_pad_type_0 = const()[name = string("op_4777_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4777_strides_0 = const()[name = string("op_4777_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4777_pad_0 = const()[name = string("op_4777_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4777_dilations_0 = const()[name = string("op_4777_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4777_groups_0 = const()[name = string("op_4777_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4777 = conv(dilations = var_4777_dilations_0, groups = var_4777_groups_0, pad = var_4777_pad_0, pad_type = var_4777_pad_type_0, strides = var_4777_strides_0, weight = layers_7_self_attn_k_proj_weight_palettized, x = var_4698_cast_fp16)[name = string("op_4777")];
+            tensor<int32, [4]> var_4782 = const()[name = string("op_4782"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4783 = reshape(shape = var_4782, x = var_4777)[name = string("op_4783")];
+            tensor<int32, [4]> var_4788 = const()[name = string("op_4788"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_4805_pad_type_0 = const()[name = string("op_4805_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4805_strides_0 = const()[name = string("op_4805_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4805_pad_0 = const()[name = string("op_4805_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4805_dilations_0 = const()[name = string("op_4805_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4805_groups_0 = const()[name = string("op_4805_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4805 = conv(dilations = var_4805_dilations_0, groups = var_4805_groups_0, pad = var_4805_pad_0, pad_type = var_4805_pad_type_0, strides = var_4805_strides_0, weight = layers_7_self_attn_v_proj_weight_palettized, x = var_4698_cast_fp16)[name = string("op_4805")];
+            tensor<int32, [4]> var_4810 = const()[name = string("op_4810"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4811 = reshape(shape = var_4810, x = var_4805)[name = string("op_4811")];
+            tensor<int32, [4]> var_4816 = const()[name = string("op_4816"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4826 = const()[name = string("op_4826"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_4789 = transpose(perm = var_4788, x = var_4783)[name = string("transpose_87")];
+            tensor<fp16, [1, 2, 256]> x_143 = reshape(shape = var_4826, x = var_4789)[name = string("x_143")];
+            int32 var_4832 = const()[name = string("op_4832"), val = int32(-1)];
+            fp16 const_86_promoted = const()[name = string("const_86_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_4834 = mul(x = x_143, y = const_86_promoted)[name = string("op_4834")];
+            bool input_217_interleave_0 = const()[name = string("input_217_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_217 = concat(axis = var_4832, interleave = input_217_interleave_0, values = (x_143, var_4834))[name = string("input_217")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4829_to_fp16 = const()[name = string("op_4829_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_4829_to_fp16, x = input_217)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_4839_split_sizes_0 = const()[name = string("op_4839_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4839_axis_0 = const()[name = string("op_4839_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_4839_0, tensor<fp16, [1, 2, 256]> var_4839_1 = split(axis = var_4839_axis_0, split_sizes = var_4839_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_4839")];
+            tensor<fp16, [1, 2, 256]> var_4841 = mul(x = var_4839_0, y = layers_0_self_attn_k_norm_weight)[name = string("op_4841")];
+            tensor<int32, [4]> var_4846 = const()[name = string("op_4846"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_61 = reshape(shape = var_4846, x = var_4841)[name = string("q_61")];
+            fp16 var_4848_promoted = const()[name = string("op_4848_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_4817 = transpose(perm = var_4816, x = var_4811)[name = string("transpose_86")];
+            tensor<fp16, [1, 2, 1, 256]> var_4849 = pow(x = var_4817, y = var_4848_promoted)[name = string("op_4849")];
+            tensor<int32, [1]> var_4854_axes_0 = const()[name = string("op_4854_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4854_keep_dims_0 = const()[name = string("op_4854_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_4854 = reduce_mean(axes = var_4854_axes_0, keep_dims = var_4854_keep_dims_0, x = var_4849)[name = string("op_4854")];
+            fp16 var_4856_to_fp16 = const()[name = string("op_4856_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_15_cast_fp16 = add(x = var_4854, y = var_4856_to_fp16)[name = string("mean_sq_15_cast_fp16")];
+            fp32 var_4858_epsilon_0 = const()[name = string("op_4858_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_4858_cast_fp16 = rsqrt(epsilon = var_4858_epsilon_0, x = mean_sq_15_cast_fp16)[name = string("op_4858_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_221_cast_fp16 = mul(x = var_4817, y = var_4858_cast_fp16)[name = string("input_221_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_4860_cast_fp16 = mul(x = q_61, y = cos_s)[name = string("op_4860_cast_fp16")];
+            tensor<int32, [2]> var_4861_split_sizes_0 = const()[name = string("op_4861_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4861_axis_0 = const()[name = string("op_4861_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_4861_0, tensor<fp16, [1, 2, 1, 128]> var_4861_1 = split(axis = var_4861_axis_0, split_sizes = var_4861_split_sizes_0, x = q_61)[name = string("op_4861")];
+            fp16 const_87_promoted = const()[name = string("const_87_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_4863 = mul(x = var_4861_1, y = const_87_promoted)[name = string("op_4863")];
+            int32 var_4865 = const()[name = string("op_4865"), val = int32(-1)];
+            bool var_4866_interleave_0 = const()[name = string("op_4866_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_4866 = concat(axis = var_4865, interleave = var_4866_interleave_0, values = (var_4863, var_4861_0))[name = string("op_4866")];
+            tensor<fp16, [1, 2, 1, 256]> var_4867_cast_fp16 = mul(x = var_4866, y = sin_s)[name = string("op_4867_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_219_cast_fp16 = add(x = var_4860_cast_fp16, y = var_4867_cast_fp16)[name = string("input_219_cast_fp16")];
+            tensor<int32, [8]> k_padded_13_pad_0 = const()[name = string("k_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_13_mode_0 = const()[name = string("k_padded_13_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_13_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = k_padded_13_mode_0, pad = k_padded_13_pad_0, x = input_219_cast_fp16)[name = string("k_padded_13_cast_fp16")];
+            tensor<int32, [8]> v_padded_13_pad_0 = const()[name = string("v_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_13_mode_0 = const()[name = string("v_padded_13_mode_0"), val = string("constant")];
+            fp16 const_89_to_fp16 = const()[name = string("const_89_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_13_cast_fp16 = pad(constant_val = const_89_to_fp16, mode = v_padded_13_mode_0, pad = v_padded_13_pad_0, x = input_221_cast_fp16)[name = string("v_padded_13_cast_fp16")];
+            tensor<int32, [4]> var_4896_begin_0 = const()[name = string("op_4896_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4896_end_0 = const()[name = string("op_4896_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4896_end_mask_0 = const()[name = string("op_4896_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4896_cast_fp16 = slice_by_index(begin = var_4896_begin_0, end = var_4896_end_0, end_mask = var_4896_end_mask_0, x = K_sliding_slot_13_cast_fp16)[name = string("op_4896_cast_fp16")];
+            int32 var_4903 = const()[name = string("op_4903"), val = int32(2)];
+            bool K_sliding_out_13_interleave_0 = const()[name = string("K_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_13_cast_fp16 = concat(axis = var_4903, interleave = K_sliding_out_13_interleave_0, values = (var_4896_cast_fp16, k_padded_13_cast_fp16))[name = string("K_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_4919_begin_0 = const()[name = string("op_4919_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4919_end_0 = const()[name = string("op_4919_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4919_end_mask_0 = const()[name = string("op_4919_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4919_cast_fp16 = slice_by_index(begin = var_4919_begin_0, end = var_4919_end_0, end_mask = var_4919_end_mask_0, x = V_sliding_slot_13_cast_fp16)[name = string("op_4919_cast_fp16")];
+            int32 var_4926 = const()[name = string("op_4926"), val = int32(2)];
+            bool V_sliding_out_13_interleave_0 = const()[name = string("V_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_13_cast_fp16 = concat(axis = var_4926, interleave = V_sliding_out_13_interleave_0, values = (var_4919_cast_fp16, v_padded_13_cast_fp16))[name = string("V_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_15_begin_0 = const()[name = string("K_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_15_end_0 = const()[name = string("K_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_15_end_mask_0 = const()[name = string("K_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_15_cast_fp16 = slice_by_index(begin = K_for_attn_15_begin_0, end = K_for_attn_15_end_0, end_mask = K_for_attn_15_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("K_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_15_begin_0 = const()[name = string("V_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_15_end_0 = const()[name = string("V_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_15_end_mask_0 = const()[name = string("V_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_15_cast_fp16 = slice_by_index(begin = V_for_attn_15_begin_0, end = V_for_attn_15_end_0, end_mask = V_for_attn_15_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("V_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> transpose_28_perm_0 = const()[name = string("transpose_28_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_14_reps_0 = const()[name = string("tile_14_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_28_cast_fp16 = transpose(perm = transpose_28_perm_0, x = K_for_attn_15_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [8, 1, 512, 256]> tile_14_cast_fp16 = tile(reps = tile_14_reps_0, x = transpose_28_cast_fp16)[name = string("tile_14_cast_fp16")];
+            tensor<int32, [5]> concat_28 = const()[name = string("concat_28"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_28_cast_fp16 = reshape(shape = concat_28, x = tile_14_cast_fp16)[name = string("reshape_28_cast_fp16")];
+            tensor<int32, [5]> transpose_29_perm_0 = const()[name = string("transpose_29_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_29 = const()[name = string("concat_29"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_29_cast_fp16 = transpose(perm = transpose_29_perm_0, x = reshape_28_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_29_cast_fp16 = reshape(shape = concat_29, x = transpose_29_cast_fp16)[name = string("reshape_29_cast_fp16")];
+            tensor<int32, [4]> transpose_55_perm_0 = const()[name = string("transpose_55_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_30_perm_0 = const()[name = string("transpose_30_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_15_reps_0 = const()[name = string("tile_15_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_30_cast_fp16 = transpose(perm = transpose_30_perm_0, x = V_for_attn_15_cast_fp16)[name = string("transpose_83")];
+            tensor<fp16, [8, 1, 512, 256]> tile_15_cast_fp16 = tile(reps = tile_15_reps_0, x = transpose_30_cast_fp16)[name = string("tile_15_cast_fp16")];
+            tensor<int32, [5]> concat_30 = const()[name = string("concat_30"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_30_cast_fp16 = reshape(shape = concat_30, x = tile_15_cast_fp16)[name = string("reshape_30_cast_fp16")];
+            tensor<int32, [5]> transpose_31_perm_0 = const()[name = string("transpose_31_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_31 = const()[name = string("concat_31"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_31_cast_fp16 = transpose(perm = transpose_31_perm_0, x = reshape_30_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_31_cast_fp16 = reshape(shape = concat_31, x = transpose_31_cast_fp16)[name = string("reshape_31_cast_fp16")];
+            tensor<int32, [4]> V_expanded_15_perm_0 = const()[name = string("V_expanded_15_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_55_cast_fp16 = transpose(perm = transpose_55_perm_0, x = reshape_29_cast_fp16)[name = string("transpose_81")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_63_cast_fp16, y = transpose_55_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_147_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_147_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_147_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 1, 512]> var_4967 = sub(x = x_147_cast_fp16, y = reduce_max_7)[name = string("op_4967")];
+            tensor<fp16, [1, 8, 1, 512]> var_4973 = exp(x = var_4967)[name = string("op_4973")];
+            tensor<int32, [1]> var_4983_axes_0 = const()[name = string("op_4983_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4983_keep_dims_0 = const()[name = string("op_4983_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_4983 = reduce_sum(axes = var_4983_axes_0, keep_dims = var_4983_keep_dims_0, x = var_4973)[name = string("op_4983")];
+            tensor<fp16, [1, 8, 1, 512]> var_4989_cast_fp16 = real_div(x = var_4973, y = var_4983)[name = string("op_4989_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_15_cast_fp16 = transpose(perm = V_expanded_15_perm_0, x = reshape_31_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_4989_cast_fp16, y = V_expanded_15_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_5000 = const()[name = string("op_5000"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5007 = const()[name = string("op_5007"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_5001_cast_fp16 = transpose(perm = var_5000, x = attn_output_43_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 1, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_5007, x = var_5001_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_5012 = const()[name = string("op_5012"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5028_pad_type_0 = const()[name = string("op_5028_pad_type_0"), val = string("valid")];
+            int32 var_5028_groups_0 = const()[name = string("op_5028_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5028_strides_0 = const()[name = string("op_5028_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5028_pad_0 = const()[name = string("op_5028_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5028_dilations_0 = const()[name = string("op_5028_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568478016))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571099520))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_5013_cast_fp16 = transpose(perm = var_5012, x = attn_output_45_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [1, 2560, 1]> var_5028_cast_fp16 = conv(dilations = var_5028_dilations_0, groups = var_5028_groups_0, pad = var_5028_pad_0, pad_type = var_5028_pad_type_0, strides = var_5028_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_5013_cast_fp16)[name = string("op_5028_cast_fp16")];
+            tensor<int32, [3]> var_5032 = const()[name = string("op_5032"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5038 = const()[name = string("op_5038"), val = int32(-1)];
+            fp16 const_90_promoted_to_fp16 = const()[name = string("const_90_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_151_cast_fp16 = transpose(perm = var_5032, x = var_5028_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 1, 2560]> var_5040_cast_fp16 = mul(x = x_151_cast_fp16, y = const_90_promoted_to_fp16)[name = string("op_5040_cast_fp16")];
+            bool input_225_interleave_0 = const()[name = string("input_225_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_225_cast_fp16 = concat(axis = var_5038, interleave = input_225_interleave_0, values = (x_151_cast_fp16, var_5040_cast_fp16))[name = string("input_225_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5035_to_fp16 = const()[name = string("op_5035_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_5035_to_fp16, x = input_225_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_5045_split_sizes_0 = const()[name = string("op_5045_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5045_axis_0 = const()[name = string("op_5045_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5045_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5045_cast_fp16_1 = split(axis = var_5045_axis_0, split_sizes = var_5045_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_5045_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571102144)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_47_cast_fp16 = mul(x = var_5045_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_153_cast_fp16 = add(x = x_139_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_153_cast_fp16")];
+            int32 var_5054 = const()[name = string("op_5054"), val = int32(-1)];
+            fp16 const_91_promoted_to_fp16 = const()[name = string("const_91_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5056_cast_fp16 = mul(x = x_153_cast_fp16, y = const_91_promoted_to_fp16)[name = string("op_5056_cast_fp16")];
+            bool input_227_interleave_0 = const()[name = string("input_227_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_227_cast_fp16 = concat(axis = var_5054, interleave = input_227_interleave_0, values = (x_153_cast_fp16, var_5056_cast_fp16))[name = string("input_227_cast_fp16")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5051_to_fp16 = const()[name = string("op_5051_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_5051_to_fp16, x = input_227_cast_fp16)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_5061_split_sizes_0 = const()[name = string("op_5061_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5061_axis_0 = const()[name = string("op_5061_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5061_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5061_cast_fp16_1 = split(axis = var_5061_axis_0, split_sizes = var_5061_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_5061_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571107328)))];
+            tensor<fp16, [1, 1, 2560]> h_45_cast_fp16 = mul(x = var_5061_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_5072 = const()[name = string("op_5072"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_229_axes_0 = const()[name = string("input_229_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5073 = transpose(perm = var_5072, x = h_45_cast_fp16)[name = string("transpose_76")];
+            tensor<fp16, [1, 2560, 1, 1]> input_229 = expand_dims(axes = input_229_axes_0, x = var_5073)[name = string("input_229")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_229)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_229)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 1]> input_231 = mul(x = gate_31, y = up_15)[name = string("input_231")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_231)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_5113_axes_0 = const()[name = string("op_5113_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5113 = squeeze(axes = var_5113_axes_0, x = mlp_out_15)[name = string("op_5113")];
+            tensor<int32, [3]> var_5117 = const()[name = string("op_5117"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5123 = const()[name = string("op_5123"), val = int32(-1)];
+            fp16 const_92_promoted = const()[name = string("const_92_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_155 = transpose(perm = var_5117, x = var_5113)[name = string("transpose_75")];
+            tensor<fp16, [1, 1, 2560]> var_5125 = mul(x = x_155, y = const_92_promoted)[name = string("op_5125")];
+            bool input_233_interleave_0 = const()[name = string("input_233_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_233 = concat(axis = var_5123, interleave = input_233_interleave_0, values = (x_155, var_5125))[name = string("input_233")];
+            tensor<int32, [1]> normed_221_axes_0 = const()[name = string("normed_221_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5120_to_fp16 = const()[name = string("op_5120_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_221_cast_fp16 = layer_norm(axes = normed_221_axes_0, epsilon = var_5120_to_fp16, x = input_233)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [2]> var_5130_split_sizes_0 = const()[name = string("op_5130_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5130_axis_0 = const()[name = string("op_5130_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5130_0, tensor<fp16, [1, 1, 2560]> var_5130_1 = split(axis = var_5130_axis_0, split_sizes = var_5130_split_sizes_0, x = normed_221_cast_fp16)[name = string("op_5130")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_73 = mul(x = var_5130_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_75_cast_fp16 = add(x = x_153_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 1792])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 1, 2048])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_5158 = const()[name = string("op_5158"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_235_axes_0 = const()[name = string("input_235_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5159 = transpose(perm = var_5158, x = hidden_states_75_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [1, 2560, 1, 1]> input_235 = expand_dims(axes = input_235_axes_0, x = var_5159)[name = string("input_235")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_235)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_5178 = const()[name = string("op_5178"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_5179_cast_fp16 = transpose(perm = var_5178, x = per_layer_slice_15_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_5179_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_237_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_237_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571112512))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571440256))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_237_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_5195_axes_0 = const()[name = string("op_5195_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5195_cast_fp16 = squeeze(axes = var_5195_axes_0, x = gated_47_cast_fp16)[name = string("op_5195_cast_fp16")];
+            tensor<int32, [3]> var_5199 = const()[name = string("op_5199"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5205 = const()[name = string("op_5205"), val = int32(-1)];
+            fp16 const_93_promoted_to_fp16 = const()[name = string("const_93_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_157_cast_fp16 = transpose(perm = var_5199, x = var_5195_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 1, 2560]> var_5207_cast_fp16 = mul(x = x_157_cast_fp16, y = const_93_promoted_to_fp16)[name = string("op_5207_cast_fp16")];
+            bool input_239_interleave_0 = const()[name = string("input_239_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_239_cast_fp16 = concat(axis = var_5205, interleave = input_239_interleave_0, values = (x_157_cast_fp16, var_5207_cast_fp16))[name = string("input_239_cast_fp16")];
+            tensor<int32, [1]> normed_225_axes_0 = const()[name = string("normed_225_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5202_to_fp16 = const()[name = string("op_5202_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_225_cast_fp16 = layer_norm(axes = normed_225_axes_0, epsilon = var_5202_to_fp16, x = input_239_cast_fp16)[name = string("normed_225_cast_fp16")];
+            tensor<int32, [2]> var_5212_split_sizes_0 = const()[name = string("op_5212_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5212_axis_0 = const()[name = string("op_5212_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5212_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5212_cast_fp16_1 = split(axis = var_5212_axis_0, split_sizes = var_5212_split_sizes_0, x = normed_225_cast_fp16)[name = string("op_5212_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571442880)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_79_cast_fp16 = mul(x = var_5212_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_94_promoted_to_fp16 = const()[name = string("const_94_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.58p-1])];
+            tensor<fp16, [1, 1, 2560]> x_159_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_94_promoted_to_fp16)[name = string("x_159_cast_fp16")];
+            tensor<int32, [1]> var_5224_axes_0 = const()[name = string("op_5224_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5224_cast_fp16 = squeeze(axes = var_5224_axes_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5224_cast_fp16")];
+            tensor<int32, [1]> var_5226_axes_0 = const()[name = string("op_5226_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5226_cast_fp16 = squeeze(axes = var_5226_axes_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5226_cast_fp16")];
+            tensor<int32, [4]> var_5229_begin_0 = const()[name = string("op_5229_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5229_end_0 = const()[name = string("op_5229_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5229_end_mask_0 = const()[name = string("op_5229_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5229_squeeze_mask_0 = const()[name = string("op_5229_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5229_cast_fp16 = slice_by_index(begin = var_5229_begin_0, end = var_5229_end_0, end_mask = var_5229_end_mask_0, squeeze_mask = var_5229_squeeze_mask_0, x = K_sliding_in)[name = string("op_5229_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_15_axes_0 = const()[name = string("K_sliding_slot_15_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_15_cast_fp16 = expand_dims(axes = K_sliding_slot_15_axes_0, x = var_5229_cast_fp16)[name = string("K_sliding_slot_15_cast_fp16")];
+            tensor<int32, [4]> var_5234_begin_0 = const()[name = string("op_5234_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5234_end_0 = const()[name = string("op_5234_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5234_end_mask_0 = const()[name = string("op_5234_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5234_squeeze_mask_0 = const()[name = string("op_5234_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5234_cast_fp16 = slice_by_index(begin = var_5234_begin_0, end = var_5234_end_0, end_mask = var_5234_end_mask_0, squeeze_mask = var_5234_squeeze_mask_0, x = V_sliding_in)[name = string("op_5234_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_15_axes_0 = const()[name = string("V_sliding_slot_15_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_15_cast_fp16 = expand_dims(axes = V_sliding_slot_15_axes_0, x = var_5234_cast_fp16)[name = string("V_sliding_slot_15_cast_fp16")];
+            int32 var_5241 = const()[name = string("op_5241"), val = int32(-1)];
+            fp16 const_95_promoted_to_fp16 = const()[name = string("const_95_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5243_cast_fp16 = mul(x = x_159_cast_fp16, y = const_95_promoted_to_fp16)[name = string("op_5243_cast_fp16")];
+            bool input_241_interleave_0 = const()[name = string("input_241_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_241_cast_fp16 = concat(axis = var_5241, interleave = input_241_interleave_0, values = (x_159_cast_fp16, var_5243_cast_fp16))[name = string("input_241_cast_fp16")];
+            tensor<int32, [1]> normed_229_axes_0 = const()[name = string("normed_229_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5238_to_fp16 = const()[name = string("op_5238_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_229_cast_fp16 = layer_norm(axes = normed_229_axes_0, epsilon = var_5238_to_fp16, x = input_241_cast_fp16)[name = string("normed_229_cast_fp16")];
+            tensor<int32, [2]> var_5248_split_sizes_0 = const()[name = string("op_5248_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5248_axis_0 = const()[name = string("op_5248_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5248_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5248_cast_fp16_1 = split(axis = var_5248_axis_0, split_sizes = var_5248_split_sizes_0, x = normed_229_cast_fp16)[name = string("op_5248_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571448064)))];
+            tensor<fp16, [1, 1, 2560]> h_49_cast_fp16 = mul(x = var_5248_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_5254 = const()[name = string("op_5254"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5257_axes_0 = const()[name = string("op_5257_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5255_cast_fp16 = transpose(perm = var_5254, x = h_49_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 2560, 1, 1]> var_5257_cast_fp16 = expand_dims(axes = var_5257_axes_0, x = var_5255_cast_fp16)[name = string("op_5257_cast_fp16")];
+            string var_5273_pad_type_0 = const()[name = string("op_5273_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5273_strides_0 = const()[name = string("op_5273_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5273_pad_0 = const()[name = string("op_5273_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5273_dilations_0 = const()[name = string("op_5273_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5273_groups_0 = const()[name = string("op_5273_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_5273 = conv(dilations = var_5273_dilations_0, groups = var_5273_groups_0, pad = var_5273_pad_0, pad_type = var_5273_pad_type_0, strides = var_5273_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_5257_cast_fp16)[name = string("op_5273")];
+            tensor<int32, [4]> var_5278 = const()[name = string("op_5278"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_5279 = reshape(shape = var_5278, x = var_5273)[name = string("op_5279")];
+            tensor<int32, [4]> var_5284 = const()[name = string("op_5284"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5294 = const()[name = string("op_5294"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_5285 = transpose(perm = var_5284, x = var_5279)[name = string("transpose_70")];
+            tensor<fp16, [1, 8, 256]> x_161 = reshape(shape = var_5294, x = var_5285)[name = string("x_161")];
+            int32 var_5300 = const()[name = string("op_5300"), val = int32(-1)];
+            fp16 const_96_promoted = const()[name = string("const_96_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_5302 = mul(x = x_161, y = const_96_promoted)[name = string("op_5302")];
+            bool input_245_interleave_0 = const()[name = string("input_245_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_245 = concat(axis = var_5300, interleave = input_245_interleave_0, values = (x_161, var_5302))[name = string("input_245")];
+            tensor<int32, [1]> normed_233_axes_0 = const()[name = string("normed_233_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5297_to_fp16 = const()[name = string("op_5297_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_233_cast_fp16 = layer_norm(axes = normed_233_axes_0, epsilon = var_5297_to_fp16, x = input_245)[name = string("normed_233_cast_fp16")];
+            tensor<int32, [2]> var_5307_split_sizes_0 = const()[name = string("op_5307_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5307_axis_0 = const()[name = string("op_5307_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_5307_0, tensor<fp16, [1, 8, 256]> var_5307_1 = split(axis = var_5307_axis_0, split_sizes = var_5307_split_sizes_0, x = normed_233_cast_fp16)[name = string("op_5307")];
+            tensor<fp16, [1, 8, 256]> var_5309 = mul(x = var_5307_0, y = layers_8_self_attn_q_norm_weight)[name = string("op_5309")];
+            tensor<int32, [4]> var_5314 = const()[name = string("op_5314"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_67 = reshape(shape = var_5314, x = var_5309)[name = string("q_67")];
+            tensor<fp16, [1, 8, 1, 256]> var_5316_cast_fp16 = mul(x = q_67, y = cos_s)[name = string("op_5316_cast_fp16")];
+            tensor<int32, [2]> var_5317_split_sizes_0 = const()[name = string("op_5317_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5317_axis_0 = const()[name = string("op_5317_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_5317_0, tensor<fp16, [1, 8, 1, 128]> var_5317_1 = split(axis = var_5317_axis_0, split_sizes = var_5317_split_sizes_0, x = q_67)[name = string("op_5317")];
+            fp16 const_97_promoted = const()[name = string("const_97_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_5319 = mul(x = var_5317_1, y = const_97_promoted)[name = string("op_5319")];
+            int32 var_5321 = const()[name = string("op_5321"), val = int32(-1)];
+            bool var_5322_interleave_0 = const()[name = string("op_5322_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_5322 = concat(axis = var_5321, interleave = var_5322_interleave_0, values = (var_5319, var_5317_0))[name = string("op_5322")];
+            tensor<fp16, [1, 8, 1, 256]> var_5323_cast_fp16 = mul(x = var_5322, y = sin_s)[name = string("op_5323_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_71_cast_fp16 = add(x = var_5316_cast_fp16, y = var_5323_cast_fp16)[name = string("q_71_cast_fp16")];
+            string var_5336_pad_type_0 = const()[name = string("op_5336_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5336_strides_0 = const()[name = string("op_5336_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5336_pad_0 = const()[name = string("op_5336_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5336_dilations_0 = const()[name = string("op_5336_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5336_groups_0 = const()[name = string("op_5336_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5336 = conv(dilations = var_5336_dilations_0, groups = var_5336_groups_0, pad = var_5336_pad_0, pad_type = var_5336_pad_type_0, strides = var_5336_strides_0, weight = layers_8_self_attn_k_proj_weight_palettized, x = var_5257_cast_fp16)[name = string("op_5336")];
+            tensor<int32, [4]> var_5341 = const()[name = string("op_5341"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5342 = reshape(shape = var_5341, x = var_5336)[name = string("op_5342")];
+            tensor<int32, [4]> var_5347 = const()[name = string("op_5347"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_5364_pad_type_0 = const()[name = string("op_5364_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5364_strides_0 = const()[name = string("op_5364_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5364_pad_0 = const()[name = string("op_5364_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5364_dilations_0 = const()[name = string("op_5364_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5364_groups_0 = const()[name = string("op_5364_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5364 = conv(dilations = var_5364_dilations_0, groups = var_5364_groups_0, pad = var_5364_pad_0, pad_type = var_5364_pad_type_0, strides = var_5364_strides_0, weight = layers_8_self_attn_v_proj_weight_palettized, x = var_5257_cast_fp16)[name = string("op_5364")];
+            tensor<int32, [4]> var_5369 = const()[name = string("op_5369"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5370 = reshape(shape = var_5369, x = var_5364)[name = string("op_5370")];
+            tensor<int32, [4]> var_5375 = const()[name = string("op_5375"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5385 = const()[name = string("op_5385"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_5348 = transpose(perm = var_5347, x = var_5342)[name = string("transpose_69")];
+            tensor<fp16, [1, 2, 256]> x_163 = reshape(shape = var_5385, x = var_5348)[name = string("x_163")];
+            int32 var_5391 = const()[name = string("op_5391"), val = int32(-1)];
+            fp16 const_98_promoted = const()[name = string("const_98_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_5393 = mul(x = x_163, y = const_98_promoted)[name = string("op_5393")];
+            bool input_247_interleave_0 = const()[name = string("input_247_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_247 = concat(axis = var_5391, interleave = input_247_interleave_0, values = (x_163, var_5393))[name = string("input_247")];
+            tensor<int32, [1]> normed_237_axes_0 = const()[name = string("normed_237_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5388_to_fp16 = const()[name = string("op_5388_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_237_cast_fp16 = layer_norm(axes = normed_237_axes_0, epsilon = var_5388_to_fp16, x = input_247)[name = string("normed_237_cast_fp16")];
+            tensor<int32, [2]> var_5398_split_sizes_0 = const()[name = string("op_5398_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5398_axis_0 = const()[name = string("op_5398_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_5398_0, tensor<fp16, [1, 2, 256]> var_5398_1 = split(axis = var_5398_axis_0, split_sizes = var_5398_split_sizes_0, x = normed_237_cast_fp16)[name = string("op_5398")];
+            tensor<fp16, [1, 2, 256]> var_5400 = mul(x = var_5398_0, y = layers_8_self_attn_k_norm_weight)[name = string("op_5400")];
+            tensor<int32, [4]> var_5405 = const()[name = string("op_5405"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_69 = reshape(shape = var_5405, x = var_5400)[name = string("q_69")];
+            fp16 var_5407_promoted = const()[name = string("op_5407_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_5376 = transpose(perm = var_5375, x = var_5370)[name = string("transpose_68")];
+            tensor<fp16, [1, 2, 1, 256]> var_5408 = pow(x = var_5376, y = var_5407_promoted)[name = string("op_5408")];
+            tensor<int32, [1]> var_5413_axes_0 = const()[name = string("op_5413_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5413_keep_dims_0 = const()[name = string("op_5413_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_5413 = reduce_mean(axes = var_5413_axes_0, keep_dims = var_5413_keep_dims_0, x = var_5408)[name = string("op_5413")];
+            fp16 var_5415_to_fp16 = const()[name = string("op_5415_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_17_cast_fp16 = add(x = var_5413, y = var_5415_to_fp16)[name = string("mean_sq_17_cast_fp16")];
+            fp32 var_5417_epsilon_0 = const()[name = string("op_5417_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_5417_cast_fp16 = rsqrt(epsilon = var_5417_epsilon_0, x = mean_sq_17_cast_fp16)[name = string("op_5417_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_251_cast_fp16 = mul(x = var_5376, y = var_5417_cast_fp16)[name = string("input_251_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_5419_cast_fp16 = mul(x = q_69, y = cos_s)[name = string("op_5419_cast_fp16")];
+            tensor<int32, [2]> var_5420_split_sizes_0 = const()[name = string("op_5420_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5420_axis_0 = const()[name = string("op_5420_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_5420_0, tensor<fp16, [1, 2, 1, 128]> var_5420_1 = split(axis = var_5420_axis_0, split_sizes = var_5420_split_sizes_0, x = q_69)[name = string("op_5420")];
+            fp16 const_99_promoted = const()[name = string("const_99_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_5422 = mul(x = var_5420_1, y = const_99_promoted)[name = string("op_5422")];
+            int32 var_5424 = const()[name = string("op_5424"), val = int32(-1)];
+            bool var_5425_interleave_0 = const()[name = string("op_5425_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_5425 = concat(axis = var_5424, interleave = var_5425_interleave_0, values = (var_5422, var_5420_0))[name = string("op_5425")];
+            tensor<fp16, [1, 2, 1, 256]> var_5426_cast_fp16 = mul(x = var_5425, y = sin_s)[name = string("op_5426_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_249_cast_fp16 = add(x = var_5419_cast_fp16, y = var_5426_cast_fp16)[name = string("input_249_cast_fp16")];
+            tensor<int32, [8]> k_padded_15_pad_0 = const()[name = string("k_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_15_mode_0 = const()[name = string("k_padded_15_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_15_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = k_padded_15_mode_0, pad = k_padded_15_pad_0, x = input_249_cast_fp16)[name = string("k_padded_15_cast_fp16")];
+            tensor<int32, [8]> v_padded_15_pad_0 = const()[name = string("v_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_15_mode_0 = const()[name = string("v_padded_15_mode_0"), val = string("constant")];
+            fp16 const_101_to_fp16 = const()[name = string("const_101_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_15_cast_fp16 = pad(constant_val = const_101_to_fp16, mode = v_padded_15_mode_0, pad = v_padded_15_pad_0, x = input_251_cast_fp16)[name = string("v_padded_15_cast_fp16")];
+            tensor<int32, [4]> var_5455_begin_0 = const()[name = string("op_5455_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5455_end_0 = const()[name = string("op_5455_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5455_end_mask_0 = const()[name = string("op_5455_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5455_cast_fp16 = slice_by_index(begin = var_5455_begin_0, end = var_5455_end_0, end_mask = var_5455_end_mask_0, x = K_sliding_slot_15_cast_fp16)[name = string("op_5455_cast_fp16")];
+            int32 var_5462 = const()[name = string("op_5462"), val = int32(2)];
+            bool K_sliding_out_15_interleave_0 = const()[name = string("K_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_15_cast_fp16 = concat(axis = var_5462, interleave = K_sliding_out_15_interleave_0, values = (var_5455_cast_fp16, k_padded_15_cast_fp16))[name = string("K_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_5478_begin_0 = const()[name = string("op_5478_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5478_end_0 = const()[name = string("op_5478_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5478_end_mask_0 = const()[name = string("op_5478_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5478_cast_fp16 = slice_by_index(begin = var_5478_begin_0, end = var_5478_end_0, end_mask = var_5478_end_mask_0, x = V_sliding_slot_15_cast_fp16)[name = string("op_5478_cast_fp16")];
+            int32 var_5485 = const()[name = string("op_5485"), val = int32(2)];
+            bool V_sliding_out_15_interleave_0 = const()[name = string("V_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_15_cast_fp16 = concat(axis = var_5485, interleave = V_sliding_out_15_interleave_0, values = (var_5478_cast_fp16, v_padded_15_cast_fp16))[name = string("V_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_17_begin_0 = const()[name = string("K_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_17_end_0 = const()[name = string("K_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_17_end_mask_0 = const()[name = string("K_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_17_cast_fp16 = slice_by_index(begin = K_for_attn_17_begin_0, end = K_for_attn_17_end_0, end_mask = K_for_attn_17_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("K_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_17_begin_0 = const()[name = string("V_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_17_end_0 = const()[name = string("V_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_17_end_mask_0 = const()[name = string("V_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_17_cast_fp16 = slice_by_index(begin = V_for_attn_17_begin_0, end = V_for_attn_17_end_0, end_mask = V_for_attn_17_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("V_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> transpose_32_perm_0 = const()[name = string("transpose_32_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_16_reps_0 = const()[name = string("tile_16_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_32_cast_fp16 = transpose(perm = transpose_32_perm_0, x = K_for_attn_17_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [8, 1, 512, 256]> tile_16_cast_fp16 = tile(reps = tile_16_reps_0, x = transpose_32_cast_fp16)[name = string("tile_16_cast_fp16")];
+            tensor<int32, [5]> concat_32 = const()[name = string("concat_32"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_32_cast_fp16 = reshape(shape = concat_32, x = tile_16_cast_fp16)[name = string("reshape_32_cast_fp16")];
+            tensor<int32, [5]> transpose_33_perm_0 = const()[name = string("transpose_33_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_33 = const()[name = string("concat_33"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_33_cast_fp16 = transpose(perm = transpose_33_perm_0, x = reshape_32_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_33_cast_fp16 = reshape(shape = concat_33, x = transpose_33_cast_fp16)[name = string("reshape_33_cast_fp16")];
+            tensor<int32, [4]> transpose_56_perm_0 = const()[name = string("transpose_56_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_34_perm_0 = const()[name = string("transpose_34_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_17_reps_0 = const()[name = string("tile_17_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_34_cast_fp16 = transpose(perm = transpose_34_perm_0, x = V_for_attn_17_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [8, 1, 512, 256]> tile_17_cast_fp16 = tile(reps = tile_17_reps_0, x = transpose_34_cast_fp16)[name = string("tile_17_cast_fp16")];
+            tensor<int32, [5]> concat_34 = const()[name = string("concat_34"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_34_cast_fp16 = reshape(shape = concat_34, x = tile_17_cast_fp16)[name = string("reshape_34_cast_fp16")];
+            tensor<int32, [5]> transpose_35_perm_0 = const()[name = string("transpose_35_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_35 = const()[name = string("concat_35"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_35_cast_fp16 = transpose(perm = transpose_35_perm_0, x = reshape_34_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_35_cast_fp16 = reshape(shape = concat_35, x = transpose_35_cast_fp16)[name = string("reshape_35_cast_fp16")];
+            tensor<int32, [4]> V_expanded_17_perm_0 = const()[name = string("V_expanded_17_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_56_cast_fp16 = transpose(perm = transpose_56_perm_0, x = reshape_33_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_71_cast_fp16, y = transpose_56_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_167_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_167_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_167_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 1, 512]> var_5526 = sub(x = x_167_cast_fp16, y = reduce_max_8)[name = string("op_5526")];
+            tensor<fp16, [1, 8, 1, 512]> var_5532 = exp(x = var_5526)[name = string("op_5532")];
+            tensor<int32, [1]> var_5542_axes_0 = const()[name = string("op_5542_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5542_keep_dims_0 = const()[name = string("op_5542_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_5542 = reduce_sum(axes = var_5542_axes_0, keep_dims = var_5542_keep_dims_0, x = var_5532)[name = string("op_5542")];
+            tensor<fp16, [1, 8, 1, 512]> var_5548_cast_fp16 = real_div(x = var_5532, y = var_5542)[name = string("op_5548_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_17_cast_fp16 = transpose(perm = V_expanded_17_perm_0, x = reshape_35_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_5548_cast_fp16, y = V_expanded_17_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_5559 = const()[name = string("op_5559"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5566 = const()[name = string("op_5566"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_5560_cast_fp16 = transpose(perm = var_5559, x = attn_output_49_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 1, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_5566, x = var_5560_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_5571 = const()[name = string("op_5571"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5587_pad_type_0 = const()[name = string("op_5587_pad_type_0"), val = string("valid")];
+            int32 var_5587_groups_0 = const()[name = string("op_5587_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5587_strides_0 = const()[name = string("op_5587_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5587_pad_0 = const()[name = string("op_5587_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5587_dilations_0 = const()[name = string("op_5587_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571453248))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574074752))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_5572_cast_fp16 = transpose(perm = var_5571, x = attn_output_51_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 2560, 1]> var_5587_cast_fp16 = conv(dilations = var_5587_dilations_0, groups = var_5587_groups_0, pad = var_5587_pad_0, pad_type = var_5587_pad_type_0, strides = var_5587_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_5572_cast_fp16)[name = string("op_5587_cast_fp16")];
+            tensor<int32, [3]> var_5591 = const()[name = string("op_5591"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5597 = const()[name = string("op_5597"), val = int32(-1)];
+            fp16 const_102_promoted_to_fp16 = const()[name = string("const_102_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_171_cast_fp16 = transpose(perm = var_5591, x = var_5587_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 1, 2560]> var_5599_cast_fp16 = mul(x = x_171_cast_fp16, y = const_102_promoted_to_fp16)[name = string("op_5599_cast_fp16")];
+            bool input_255_interleave_0 = const()[name = string("input_255_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_255_cast_fp16 = concat(axis = var_5597, interleave = input_255_interleave_0, values = (x_171_cast_fp16, var_5599_cast_fp16))[name = string("input_255_cast_fp16")];
+            tensor<int32, [1]> normed_241_axes_0 = const()[name = string("normed_241_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5594_to_fp16 = const()[name = string("op_5594_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_241_cast_fp16 = layer_norm(axes = normed_241_axes_0, epsilon = var_5594_to_fp16, x = input_255_cast_fp16)[name = string("normed_241_cast_fp16")];
+            tensor<int32, [2]> var_5604_split_sizes_0 = const()[name = string("op_5604_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5604_axis_0 = const()[name = string("op_5604_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5604_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5604_cast_fp16_1 = split(axis = var_5604_axis_0, split_sizes = var_5604_split_sizes_0, x = normed_241_cast_fp16)[name = string("op_5604_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574077376)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_53_cast_fp16 = mul(x = var_5604_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_53_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_173_cast_fp16 = add(x = x_159_cast_fp16, y = attn_output_53_cast_fp16)[name = string("x_173_cast_fp16")];
+            int32 var_5613 = const()[name = string("op_5613"), val = int32(-1)];
+            fp16 const_103_promoted_to_fp16 = const()[name = string("const_103_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5615_cast_fp16 = mul(x = x_173_cast_fp16, y = const_103_promoted_to_fp16)[name = string("op_5615_cast_fp16")];
+            bool input_257_interleave_0 = const()[name = string("input_257_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_257_cast_fp16 = concat(axis = var_5613, interleave = input_257_interleave_0, values = (x_173_cast_fp16, var_5615_cast_fp16))[name = string("input_257_cast_fp16")];
+            tensor<int32, [1]> normed_245_axes_0 = const()[name = string("normed_245_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5610_to_fp16 = const()[name = string("op_5610_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_245_cast_fp16 = layer_norm(axes = normed_245_axes_0, epsilon = var_5610_to_fp16, x = input_257_cast_fp16)[name = string("normed_245_cast_fp16")];
+            tensor<int32, [2]> var_5620_split_sizes_0 = const()[name = string("op_5620_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5620_axis_0 = const()[name = string("op_5620_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5620_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5620_cast_fp16_1 = split(axis = var_5620_axis_0, split_sizes = var_5620_split_sizes_0, x = normed_245_cast_fp16)[name = string("op_5620_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574082560)))];
+            tensor<fp16, [1, 1, 2560]> h_51_cast_fp16 = mul(x = var_5620_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_5631 = const()[name = string("op_5631"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_259_axes_0 = const()[name = string("input_259_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5632 = transpose(perm = var_5631, x = h_51_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 2560, 1, 1]> input_259 = expand_dims(axes = input_259_axes_0, x = var_5632)[name = string("input_259")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_259)[name = string("gate_33")];
+            string up_17_pad_type_0 = const()[name = string("up_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_17_strides_0 = const()[name = string("up_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_17_pad_0 = const()[name = string("up_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_17_dilations_0 = const()[name = string("up_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_17_groups_0 = const()[name = string("up_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_17 = conv(dilations = up_17_dilations_0, groups = up_17_groups_0, pad = up_17_pad_0, pad_type = up_17_pad_type_0, strides = up_17_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_259)[name = string("up_17")];
+            string gate_35_mode_0 = const()[name = string("gate_35_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_35 = gelu(mode = gate_35_mode_0, x = gate_33)[name = string("gate_35")];
+            tensor<fp16, [1, 10240, 1, 1]> input_261 = mul(x = gate_35, y = up_17)[name = string("input_261")];
+            string mlp_out_17_pad_type_0 = const()[name = string("mlp_out_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_17_strides_0 = const()[name = string("mlp_out_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_17_pad_0 = const()[name = string("mlp_out_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_17_dilations_0 = const()[name = string("mlp_out_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_17_groups_0 = const()[name = string("mlp_out_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_17 = conv(dilations = mlp_out_17_dilations_0, groups = mlp_out_17_groups_0, pad = mlp_out_17_pad_0, pad_type = mlp_out_17_pad_type_0, strides = mlp_out_17_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_261)[name = string("mlp_out_17")];
+            tensor<int32, [1]> var_5672_axes_0 = const()[name = string("op_5672_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5672 = squeeze(axes = var_5672_axes_0, x = mlp_out_17)[name = string("op_5672")];
+            tensor<int32, [3]> var_5676 = const()[name = string("op_5676"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5682 = const()[name = string("op_5682"), val = int32(-1)];
+            fp16 const_104_promoted = const()[name = string("const_104_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_175 = transpose(perm = var_5676, x = var_5672)[name = string("transpose_57")];
+            tensor<fp16, [1, 1, 2560]> var_5684 = mul(x = x_175, y = const_104_promoted)[name = string("op_5684")];
+            bool input_263_interleave_0 = const()[name = string("input_263_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_263 = concat(axis = var_5682, interleave = input_263_interleave_0, values = (x_175, var_5684))[name = string("input_263")];
+            tensor<int32, [1]> normed_249_axes_0 = const()[name = string("normed_249_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5679_to_fp16 = const()[name = string("op_5679_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_249_cast_fp16 = layer_norm(axes = normed_249_axes_0, epsilon = var_5679_to_fp16, x = input_263)[name = string("normed_249_cast_fp16")];
+            tensor<int32, [2]> var_5689_split_sizes_0 = const()[name = string("op_5689_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5689_axis_0 = const()[name = string("op_5689_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5689_0, tensor<fp16, [1, 1, 2560]> var_5689_1 = split(axis = var_5689_axis_0, split_sizes = var_5689_split_sizes_0, x = normed_249_cast_fp16)[name = string("op_5689")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_83 = mul(x = var_5689_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_85_cast_fp16 = add(x = x_173_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_17_begin_0 = const()[name = string("per_layer_slice_17_begin_0"), val = tensor<int32, [3]>([0, 0, 2048])];
+            tensor<int32, [3]> per_layer_slice_17_end_0 = const()[name = string("per_layer_slice_17_end_0"), val = tensor<int32, [3]>([1, 1, 2304])];
+            tensor<bool, [3]> per_layer_slice_17_end_mask_0 = const()[name = string("per_layer_slice_17_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_17_cast_fp16 = slice_by_index(begin = per_layer_slice_17_begin_0, end = per_layer_slice_17_end_0, end_mask = per_layer_slice_17_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_17_cast_fp16")];
+            tensor<int32, [3]> var_5717 = const()[name = string("op_5717"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_265_axes_0 = const()[name = string("input_265_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5718 = transpose(perm = var_5717, x = hidden_states_85_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 1, 1]> input_265 = expand_dims(axes = input_265_axes_0, x = var_5718)[name = string("input_265")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_265)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_5737 = const()[name = string("op_5737"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_17_axes_0 = const()[name = string("per_layer_slice_conv_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_5738_cast_fp16 = transpose(perm = var_5737, x = per_layer_slice_17_cast_fp16)[name = string("transpose_55")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_17_cast_fp16 = expand_dims(axes = per_layer_slice_conv_17_axes_0, x = var_5738_cast_fp16)[name = string("per_layer_slice_conv_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_267_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_17_cast_fp16)[name = string("input_267_cast_fp16")];
+            string gated_53_pad_type_0 = const()[name = string("gated_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_53_strides_0 = const()[name = string("gated_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_53_pad_0 = const()[name = string("gated_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_53_dilations_0 = const()[name = string("gated_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_53_groups_0 = const()[name = string("gated_53_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574087744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574415488))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_53_cast_fp16 = conv(dilations = gated_53_dilations_0, groups = gated_53_groups_0, pad = gated_53_pad_0, pad_type = gated_53_pad_type_0, strides = gated_53_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_267_cast_fp16)[name = string("gated_53_cast_fp16")];
+            tensor<int32, [1]> var_5754_axes_0 = const()[name = string("op_5754_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5754_cast_fp16 = squeeze(axes = var_5754_axes_0, x = gated_53_cast_fp16)[name = string("op_5754_cast_fp16")];
+            tensor<int32, [3]> var_5758 = const()[name = string("op_5758"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5764 = const()[name = string("op_5764"), val = int32(-1)];
+            fp16 const_105_promoted_to_fp16 = const()[name = string("const_105_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_177_cast_fp16 = transpose(perm = var_5758, x = var_5754_cast_fp16)[name = string("transpose_54")];
+            tensor<fp16, [1, 1, 2560]> var_5766_cast_fp16 = mul(x = x_177_cast_fp16, y = const_105_promoted_to_fp16)[name = string("op_5766_cast_fp16")];
+            bool input_269_interleave_0 = const()[name = string("input_269_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_269_cast_fp16 = concat(axis = var_5764, interleave = input_269_interleave_0, values = (x_177_cast_fp16, var_5766_cast_fp16))[name = string("input_269_cast_fp16")];
+            tensor<int32, [1]> normed_253_axes_0 = const()[name = string("normed_253_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5761_to_fp16 = const()[name = string("op_5761_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_253_cast_fp16 = layer_norm(axes = normed_253_axes_0, epsilon = var_5761_to_fp16, x = input_269_cast_fp16)[name = string("normed_253_cast_fp16")];
+            tensor<int32, [2]> var_5771_split_sizes_0 = const()[name = string("op_5771_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5771_axis_0 = const()[name = string("op_5771_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5771_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5771_cast_fp16_1 = split(axis = var_5771_axis_0, split_sizes = var_5771_split_sizes_0, x = normed_253_cast_fp16)[name = string("op_5771_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574418112)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_89_cast_fp16 = mul(x = var_5771_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_91_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1]> const_106_promoted_to_fp16 = const()[name = string("const_106_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.d4p-3])];
+            tensor<fp16, [1, 1, 2560]> x_179_cast_fp16 = mul(x = hidden_states_91_cast_fp16, y = const_106_promoted_to_fp16)[name = string("x_179_cast_fp16")];
+            tensor<int32, [1]> var_5783_axes_0 = const()[name = string("op_5783_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5783_cast_fp16 = squeeze(axes = var_5783_axes_0, x = K_sliding_out_15_cast_fp16)[name = string("op_5783_cast_fp16")];
+            tensor<int32, [1]> var_5785_axes_0 = const()[name = string("op_5785_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5785_cast_fp16 = squeeze(axes = var_5785_axes_0, x = V_sliding_out_15_cast_fp16)[name = string("op_5785_cast_fp16")];
+            tensor<int32, [4]> var_5788_begin_0 = const()[name = string("op_5788_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_5788_end_0 = const()[name = string("op_5788_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_5788_end_mask_0 = const()[name = string("op_5788_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5788_squeeze_mask_0 = const()[name = string("op_5788_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5788_cast_fp16 = slice_by_index(begin = var_5788_begin_0, end = var_5788_end_0, end_mask = var_5788_end_mask_0, squeeze_mask = var_5788_squeeze_mask_0, x = K_sliding_in)[name = string("op_5788_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_17_axes_0 = const()[name = string("K_sliding_slot_17_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_17_cast_fp16 = expand_dims(axes = K_sliding_slot_17_axes_0, x = var_5788_cast_fp16)[name = string("K_sliding_slot_17_cast_fp16")];
+            tensor<int32, [4]> var_5793_begin_0 = const()[name = string("op_5793_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_5793_end_0 = const()[name = string("op_5793_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_5793_end_mask_0 = const()[name = string("op_5793_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5793_squeeze_mask_0 = const()[name = string("op_5793_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5793_cast_fp16 = slice_by_index(begin = var_5793_begin_0, end = var_5793_end_0, end_mask = var_5793_end_mask_0, squeeze_mask = var_5793_squeeze_mask_0, x = V_sliding_in)[name = string("op_5793_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_17_axes_0 = const()[name = string("V_sliding_slot_17_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_17_cast_fp16 = expand_dims(axes = V_sliding_slot_17_axes_0, x = var_5793_cast_fp16)[name = string("V_sliding_slot_17_cast_fp16")];
+            int32 var_5800 = const()[name = string("op_5800"), val = int32(-1)];
+            fp16 const_107_promoted_to_fp16 = const()[name = string("const_107_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5802_cast_fp16 = mul(x = x_179_cast_fp16, y = const_107_promoted_to_fp16)[name = string("op_5802_cast_fp16")];
+            bool input_271_interleave_0 = const()[name = string("input_271_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_271_cast_fp16 = concat(axis = var_5800, interleave = input_271_interleave_0, values = (x_179_cast_fp16, var_5802_cast_fp16))[name = string("input_271_cast_fp16")];
+            tensor<int32, [1]> normed_257_axes_0 = const()[name = string("normed_257_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5797_to_fp16 = const()[name = string("op_5797_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_257_cast_fp16 = layer_norm(axes = normed_257_axes_0, epsilon = var_5797_to_fp16, x = input_271_cast_fp16)[name = string("normed_257_cast_fp16")];
+            tensor<int32, [2]> var_5807_split_sizes_0 = const()[name = string("op_5807_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5807_axis_0 = const()[name = string("op_5807_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5807_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5807_cast_fp16_1 = split(axis = var_5807_axis_0, split_sizes = var_5807_split_sizes_0, x = normed_257_cast_fp16)[name = string("op_5807_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574423296)))];
+            tensor<fp16, [1, 1, 2560]> h_55_cast_fp16 = mul(x = var_5807_cast_fp16_0, y = layers_9_input_layernorm_weight_promoted_to_fp16)[name = string("h_55_cast_fp16")];
+            tensor<int32, [3]> var_5813 = const()[name = string("op_5813"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5816_axes_0 = const()[name = string("op_5816_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5814_cast_fp16 = transpose(perm = var_5813, x = h_55_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 2560, 1, 1]> var_5816_cast_fp16 = expand_dims(axes = var_5816_axes_0, x = var_5814_cast_fp16)[name = string("op_5816_cast_fp16")];
+            string var_5832_pad_type_0 = const()[name = string("op_5832_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5832_strides_0 = const()[name = string("op_5832_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5832_pad_0 = const()[name = string("op_5832_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5832_dilations_0 = const()[name = string("op_5832_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5832_groups_0 = const()[name = string("op_5832_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_5832 = conv(dilations = var_5832_dilations_0, groups = var_5832_groups_0, pad = var_5832_pad_0, pad_type = var_5832_pad_type_0, strides = var_5832_strides_0, weight = layers_9_self_attn_q_proj_weight_palettized, x = var_5816_cast_fp16)[name = string("op_5832")];
+            tensor<int32, [4]> var_5837 = const()[name = string("op_5837"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_5838 = reshape(shape = var_5837, x = var_5832)[name = string("op_5838")];
+            tensor<int32, [4]> var_5843 = const()[name = string("op_5843"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5853 = const()[name = string("op_5853"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_5844 = transpose(perm = var_5843, x = var_5838)[name = string("transpose_52")];
+            tensor<fp16, [1, 8, 256]> x_181 = reshape(shape = var_5853, x = var_5844)[name = string("x_181")];
+            int32 var_5859 = const()[name = string("op_5859"), val = int32(-1)];
+            fp16 const_108_promoted = const()[name = string("const_108_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_5861 = mul(x = x_181, y = const_108_promoted)[name = string("op_5861")];
+            bool input_275_interleave_0 = const()[name = string("input_275_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_275 = concat(axis = var_5859, interleave = input_275_interleave_0, values = (x_181, var_5861))[name = string("input_275")];
+            tensor<int32, [1]> normed_261_axes_0 = const()[name = string("normed_261_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5856_to_fp16 = const()[name = string("op_5856_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_261_cast_fp16 = layer_norm(axes = normed_261_axes_0, epsilon = var_5856_to_fp16, x = input_275)[name = string("normed_261_cast_fp16")];
+            tensor<int32, [2]> var_5866_split_sizes_0 = const()[name = string("op_5866_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5866_axis_0 = const()[name = string("op_5866_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_5866_0, tensor<fp16, [1, 8, 256]> var_5866_1 = split(axis = var_5866_axis_0, split_sizes = var_5866_split_sizes_0, x = normed_261_cast_fp16)[name = string("op_5866")];
+            tensor<fp16, [1, 8, 256]> var_5868 = mul(x = var_5866_0, y = layers_9_self_attn_q_norm_weight)[name = string("op_5868")];
+            tensor<int32, [4]> var_5873 = const()[name = string("op_5873"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_75 = reshape(shape = var_5873, x = var_5868)[name = string("q_75")];
+            tensor<fp16, [1, 8, 1, 256]> var_5875_cast_fp16 = mul(x = q_75, y = cos_s)[name = string("op_5875_cast_fp16")];
+            tensor<int32, [2]> var_5876_split_sizes_0 = const()[name = string("op_5876_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5876_axis_0 = const()[name = string("op_5876_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_5876_0, tensor<fp16, [1, 8, 1, 128]> var_5876_1 = split(axis = var_5876_axis_0, split_sizes = var_5876_split_sizes_0, x = q_75)[name = string("op_5876")];
+            fp16 const_109_promoted = const()[name = string("const_109_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_5878 = mul(x = var_5876_1, y = const_109_promoted)[name = string("op_5878")];
+            int32 var_5880 = const()[name = string("op_5880"), val = int32(-1)];
+            bool var_5881_interleave_0 = const()[name = string("op_5881_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_5881 = concat(axis = var_5880, interleave = var_5881_interleave_0, values = (var_5878, var_5876_0))[name = string("op_5881")];
+            tensor<fp16, [1, 8, 1, 256]> var_5882_cast_fp16 = mul(x = var_5881, y = sin_s)[name = string("op_5882_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_79_cast_fp16 = add(x = var_5875_cast_fp16, y = var_5882_cast_fp16)[name = string("q_79_cast_fp16")];
+            string var_5895_pad_type_0 = const()[name = string("op_5895_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5895_strides_0 = const()[name = string("op_5895_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5895_pad_0 = const()[name = string("op_5895_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5895_dilations_0 = const()[name = string("op_5895_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5895_groups_0 = const()[name = string("op_5895_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5895 = conv(dilations = var_5895_dilations_0, groups = var_5895_groups_0, pad = var_5895_pad_0, pad_type = var_5895_pad_type_0, strides = var_5895_strides_0, weight = layers_9_self_attn_k_proj_weight_palettized, x = var_5816_cast_fp16)[name = string("op_5895")];
+            tensor<int32, [4]> var_5900 = const()[name = string("op_5900"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5901 = reshape(shape = var_5900, x = var_5895)[name = string("op_5901")];
+            tensor<int32, [4]> var_5906 = const()[name = string("op_5906"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_5923_pad_type_0 = const()[name = string("op_5923_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5923_strides_0 = const()[name = string("op_5923_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5923_pad_0 = const()[name = string("op_5923_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5923_dilations_0 = const()[name = string("op_5923_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5923_groups_0 = const()[name = string("op_5923_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5923 = conv(dilations = var_5923_dilations_0, groups = var_5923_groups_0, pad = var_5923_pad_0, pad_type = var_5923_pad_type_0, strides = var_5923_strides_0, weight = layers_9_self_attn_v_proj_weight_palettized, x = var_5816_cast_fp16)[name = string("op_5923")];
+            tensor<int32, [4]> var_5928 = const()[name = string("op_5928"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5929 = reshape(shape = var_5928, x = var_5923)[name = string("op_5929")];
+            tensor<int32, [4]> var_5934 = const()[name = string("op_5934"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5944 = const()[name = string("op_5944"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_5907 = transpose(perm = var_5906, x = var_5901)[name = string("transpose_51")];
+            tensor<fp16, [1, 2, 256]> x_183 = reshape(shape = var_5944, x = var_5907)[name = string("x_183")];
+            int32 var_5950 = const()[name = string("op_5950"), val = int32(-1)];
+            fp16 const_110_promoted = const()[name = string("const_110_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_5952 = mul(x = x_183, y = const_110_promoted)[name = string("op_5952")];
+            bool input_277_interleave_0 = const()[name = string("input_277_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_277 = concat(axis = var_5950, interleave = input_277_interleave_0, values = (x_183, var_5952))[name = string("input_277")];
+            tensor<int32, [1]> normed_265_axes_0 = const()[name = string("normed_265_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5947_to_fp16 = const()[name = string("op_5947_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_265_cast_fp16 = layer_norm(axes = normed_265_axes_0, epsilon = var_5947_to_fp16, x = input_277)[name = string("normed_265_cast_fp16")];
+            tensor<int32, [2]> var_5957_split_sizes_0 = const()[name = string("op_5957_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5957_axis_0 = const()[name = string("op_5957_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_5957_0, tensor<fp16, [1, 2, 256]> var_5957_1 = split(axis = var_5957_axis_0, split_sizes = var_5957_split_sizes_0, x = normed_265_cast_fp16)[name = string("op_5957")];
+            tensor<fp16, [1, 2, 256]> var_5959 = mul(x = var_5957_0, y = layers_9_self_attn_k_norm_weight)[name = string("op_5959")];
+            tensor<int32, [4]> var_5964 = const()[name = string("op_5964"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_77 = reshape(shape = var_5964, x = var_5959)[name = string("q_77")];
+            fp16 var_5966_promoted = const()[name = string("op_5966_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_5935 = transpose(perm = var_5934, x = var_5929)[name = string("transpose_50")];
+            tensor<fp16, [1, 2, 1, 256]> var_5967 = pow(x = var_5935, y = var_5966_promoted)[name = string("op_5967")];
+            tensor<int32, [1]> var_5972_axes_0 = const()[name = string("op_5972_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5972_keep_dims_0 = const()[name = string("op_5972_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_5972 = reduce_mean(axes = var_5972_axes_0, keep_dims = var_5972_keep_dims_0, x = var_5967)[name = string("op_5972")];
+            fp16 var_5974_to_fp16 = const()[name = string("op_5974_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_19_cast_fp16 = add(x = var_5972, y = var_5974_to_fp16)[name = string("mean_sq_19_cast_fp16")];
+            fp32 var_5976_epsilon_0 = const()[name = string("op_5976_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_5976_cast_fp16 = rsqrt(epsilon = var_5976_epsilon_0, x = mean_sq_19_cast_fp16)[name = string("op_5976_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_281_cast_fp16 = mul(x = var_5935, y = var_5976_cast_fp16)[name = string("input_281_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_5978_cast_fp16 = mul(x = q_77, y = cos_s)[name = string("op_5978_cast_fp16")];
+            tensor<int32, [2]> var_5979_split_sizes_0 = const()[name = string("op_5979_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5979_axis_0 = const()[name = string("op_5979_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_5979_0, tensor<fp16, [1, 2, 1, 128]> var_5979_1 = split(axis = var_5979_axis_0, split_sizes = var_5979_split_sizes_0, x = q_77)[name = string("op_5979")];
+            fp16 const_111_promoted = const()[name = string("const_111_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_5981 = mul(x = var_5979_1, y = const_111_promoted)[name = string("op_5981")];
+            int32 var_5983 = const()[name = string("op_5983"), val = int32(-1)];
+            bool var_5984_interleave_0 = const()[name = string("op_5984_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_5984 = concat(axis = var_5983, interleave = var_5984_interleave_0, values = (var_5981, var_5979_0))[name = string("op_5984")];
+            tensor<fp16, [1, 2, 1, 256]> var_5985_cast_fp16 = mul(x = var_5984, y = sin_s)[name = string("op_5985_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_279_cast_fp16 = add(x = var_5978_cast_fp16, y = var_5985_cast_fp16)[name = string("input_279_cast_fp16")];
+            tensor<int32, [8]> k_padded_17_pad_0 = const()[name = string("k_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_17_mode_0 = const()[name = string("k_padded_17_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_17_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = k_padded_17_mode_0, pad = k_padded_17_pad_0, x = input_279_cast_fp16)[name = string("k_padded_17_cast_fp16")];
+            tensor<int32, [8]> v_padded_17_pad_0 = const()[name = string("v_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_17_mode_0 = const()[name = string("v_padded_17_mode_0"), val = string("constant")];
+            fp16 const_113_to_fp16 = const()[name = string("const_113_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_17_cast_fp16 = pad(constant_val = const_113_to_fp16, mode = v_padded_17_mode_0, pad = v_padded_17_pad_0, x = input_281_cast_fp16)[name = string("v_padded_17_cast_fp16")];
+            tensor<int32, [4]> var_6014_begin_0 = const()[name = string("op_6014_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6014_end_0 = const()[name = string("op_6014_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6014_end_mask_0 = const()[name = string("op_6014_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6014_cast_fp16 = slice_by_index(begin = var_6014_begin_0, end = var_6014_end_0, end_mask = var_6014_end_mask_0, x = K_sliding_slot_17_cast_fp16)[name = string("op_6014_cast_fp16")];
+            int32 var_6021 = const()[name = string("op_6021"), val = int32(2)];
+            bool K_sliding_out_17_interleave_0 = const()[name = string("K_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_17_cast_fp16 = concat(axis = var_6021, interleave = K_sliding_out_17_interleave_0, values = (var_6014_cast_fp16, k_padded_17_cast_fp16))[name = string("K_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_6037_begin_0 = const()[name = string("op_6037_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6037_end_0 = const()[name = string("op_6037_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6037_end_mask_0 = const()[name = string("op_6037_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6037_cast_fp16 = slice_by_index(begin = var_6037_begin_0, end = var_6037_end_0, end_mask = var_6037_end_mask_0, x = V_sliding_slot_17_cast_fp16)[name = string("op_6037_cast_fp16")];
+            int32 var_6044 = const()[name = string("op_6044"), val = int32(2)];
+            bool V_sliding_out_17_interleave_0 = const()[name = string("V_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_17_cast_fp16 = concat(axis = var_6044, interleave = V_sliding_out_17_interleave_0, values = (var_6037_cast_fp16, v_padded_17_cast_fp16))[name = string("V_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_19_begin_0 = const()[name = string("K_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_19_end_0 = const()[name = string("K_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_19_end_mask_0 = const()[name = string("K_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_19_cast_fp16 = slice_by_index(begin = K_for_attn_19_begin_0, end = K_for_attn_19_end_0, end_mask = K_for_attn_19_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("K_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_19_begin_0 = const()[name = string("V_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_19_end_0 = const()[name = string("V_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_19_end_mask_0 = const()[name = string("V_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_19_cast_fp16 = slice_by_index(begin = V_for_attn_19_begin_0, end = V_for_attn_19_end_0, end_mask = V_for_attn_19_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("V_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_18_reps_0 = const()[name = string("tile_18_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = K_for_attn_19_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [8, 1, 512, 256]> tile_18_cast_fp16 = tile(reps = tile_18_reps_0, x = transpose_36_cast_fp16)[name = string("tile_18_cast_fp16")];
+            tensor<int32, [5]> concat_36 = const()[name = string("concat_36"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_36_cast_fp16 = reshape(shape = concat_36, x = tile_18_cast_fp16)[name = string("reshape_36_cast_fp16")];
+            tensor<int32, [5]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_37 = const()[name = string("concat_37"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_36_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_37_cast_fp16 = reshape(shape = concat_37, x = transpose_37_cast_fp16)[name = string("reshape_37_cast_fp16")];
+            tensor<int32, [4]> transpose_57_perm_0 = const()[name = string("transpose_57_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_19_reps_0 = const()[name = string("tile_19_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = V_for_attn_19_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [8, 1, 512, 256]> tile_19_cast_fp16 = tile(reps = tile_19_reps_0, x = transpose_38_cast_fp16)[name = string("tile_19_cast_fp16")];
+            tensor<int32, [5]> concat_38 = const()[name = string("concat_38"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_38_cast_fp16 = reshape(shape = concat_38, x = tile_19_cast_fp16)[name = string("reshape_38_cast_fp16")];
+            tensor<int32, [5]> transpose_39_perm_0 = const()[name = string("transpose_39_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_39 = const()[name = string("concat_39"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_39_cast_fp16 = transpose(perm = transpose_39_perm_0, x = reshape_38_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_39_cast_fp16 = reshape(shape = concat_39, x = transpose_39_cast_fp16)[name = string("reshape_39_cast_fp16")];
+            tensor<int32, [4]> V_expanded_19_perm_0 = const()[name = string("V_expanded_19_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_37_transpose_x_0 = const()[name = string("attn_weights_37_transpose_x_0"), val = bool(false)];
+            bool attn_weights_37_transpose_y_0 = const()[name = string("attn_weights_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_57_cast_fp16 = transpose(perm = transpose_57_perm_0, x = reshape_37_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_37_cast_fp16 = matmul(transpose_x = attn_weights_37_transpose_x_0, transpose_y = attn_weights_37_transpose_y_0, x = q_79_cast_fp16, y = transpose_57_cast_fp16)[name = string("attn_weights_37_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_187_cast_fp16 = add(x = attn_weights_37_cast_fp16, y = causal_mask_sliding)[name = string("x_187_cast_fp16")];
+            tensor<int32, [1]> reduce_max_9_axes_0 = const()[name = string("reduce_max_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_9_keep_dims_0 = const()[name = string("reduce_max_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_9 = reduce_max(axes = reduce_max_9_axes_0, keep_dims = reduce_max_9_keep_dims_0, x = x_187_cast_fp16)[name = string("reduce_max_9")];
+            tensor<fp16, [1, 8, 1, 512]> var_6085 = sub(x = x_187_cast_fp16, y = reduce_max_9)[name = string("op_6085")];
+            tensor<fp16, [1, 8, 1, 512]> var_6091 = exp(x = var_6085)[name = string("op_6091")];
+            tensor<int32, [1]> var_6101_axes_0 = const()[name = string("op_6101_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6101_keep_dims_0 = const()[name = string("op_6101_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_6101 = reduce_sum(axes = var_6101_axes_0, keep_dims = var_6101_keep_dims_0, x = var_6091)[name = string("op_6101")];
+            tensor<fp16, [1, 8, 1, 512]> var_6107_cast_fp16 = real_div(x = var_6091, y = var_6101)[name = string("op_6107_cast_fp16")];
+            bool attn_output_55_transpose_x_0 = const()[name = string("attn_output_55_transpose_x_0"), val = bool(false)];
+            bool attn_output_55_transpose_y_0 = const()[name = string("attn_output_55_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_19_cast_fp16 = transpose(perm = V_expanded_19_perm_0, x = reshape_39_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_55_cast_fp16 = matmul(transpose_x = attn_output_55_transpose_x_0, transpose_y = attn_output_55_transpose_y_0, x = var_6107_cast_fp16, y = V_expanded_19_cast_fp16)[name = string("attn_output_55_cast_fp16")];
+            tensor<int32, [4]> var_6118 = const()[name = string("op_6118"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6125 = const()[name = string("op_6125"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_6119_cast_fp16 = transpose(perm = var_6118, x = attn_output_55_cast_fp16)[name = string("transpose_43")];
+            tensor<fp16, [1, 1, 2048]> attn_output_57_cast_fp16 = reshape(shape = var_6125, x = var_6119_cast_fp16)[name = string("attn_output_57_cast_fp16")];
+            tensor<int32, [3]> var_6130 = const()[name = string("op_6130"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6146_pad_type_0 = const()[name = string("op_6146_pad_type_0"), val = string("valid")];
+            int32 var_6146_groups_0 = const()[name = string("op_6146_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6146_strides_0 = const()[name = string("op_6146_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6146_pad_0 = const()[name = string("op_6146_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6146_dilations_0 = const()[name = string("op_6146_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_9_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574428480))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577049984))))[name = string("squeeze_9_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_6131_cast_fp16 = transpose(perm = var_6130, x = attn_output_57_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 2560, 1]> var_6146_cast_fp16 = conv(dilations = var_6146_dilations_0, groups = var_6146_groups_0, pad = var_6146_pad_0, pad_type = var_6146_pad_type_0, strides = var_6146_strides_0, weight = squeeze_9_cast_fp16_to_fp32_to_fp16_palettized, x = var_6131_cast_fp16)[name = string("op_6146_cast_fp16")];
+            tensor<int32, [3]> var_6150 = const()[name = string("op_6150"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6156 = const()[name = string("op_6156"), val = int32(-1)];
+            fp16 const_114_promoted_to_fp16 = const()[name = string("const_114_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_191_cast_fp16 = transpose(perm = var_6150, x = var_6146_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 1, 2560]> var_6158_cast_fp16 = mul(x = x_191_cast_fp16, y = const_114_promoted_to_fp16)[name = string("op_6158_cast_fp16")];
+            bool input_285_interleave_0 = const()[name = string("input_285_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_285_cast_fp16 = concat(axis = var_6156, interleave = input_285_interleave_0, values = (x_191_cast_fp16, var_6158_cast_fp16))[name = string("input_285_cast_fp16")];
+            tensor<int32, [1]> normed_269_axes_0 = const()[name = string("normed_269_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6153_to_fp16 = const()[name = string("op_6153_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_269_cast_fp16 = layer_norm(axes = normed_269_axes_0, epsilon = var_6153_to_fp16, x = input_285_cast_fp16)[name = string("normed_269_cast_fp16")];
+            tensor<int32, [2]> var_6163_split_sizes_0 = const()[name = string("op_6163_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6163_axis_0 = const()[name = string("op_6163_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6163_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6163_cast_fp16_1 = split(axis = var_6163_axis_0, split_sizes = var_6163_split_sizes_0, x = normed_269_cast_fp16)[name = string("op_6163_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577052608)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_59_cast_fp16 = mul(x = var_6163_cast_fp16_0, y = layers_9_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_193_cast_fp16 = add(x = x_179_cast_fp16, y = attn_output_59_cast_fp16)[name = string("x_193_cast_fp16")];
+            int32 var_6172 = const()[name = string("op_6172"), val = int32(-1)];
+            fp16 const_115_promoted_to_fp16 = const()[name = string("const_115_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6174_cast_fp16 = mul(x = x_193_cast_fp16, y = const_115_promoted_to_fp16)[name = string("op_6174_cast_fp16")];
+            bool input_287_interleave_0 = const()[name = string("input_287_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_287_cast_fp16 = concat(axis = var_6172, interleave = input_287_interleave_0, values = (x_193_cast_fp16, var_6174_cast_fp16))[name = string("input_287_cast_fp16")];
+            tensor<int32, [1]> normed_273_axes_0 = const()[name = string("normed_273_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6169_to_fp16 = const()[name = string("op_6169_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_273_cast_fp16 = layer_norm(axes = normed_273_axes_0, epsilon = var_6169_to_fp16, x = input_287_cast_fp16)[name = string("normed_273_cast_fp16")];
+            tensor<int32, [2]> var_6179_split_sizes_0 = const()[name = string("op_6179_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6179_axis_0 = const()[name = string("op_6179_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6179_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6179_cast_fp16_1 = split(axis = var_6179_axis_0, split_sizes = var_6179_split_sizes_0, x = normed_273_cast_fp16)[name = string("op_6179_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577057792)))];
+            tensor<fp16, [1, 1, 2560]> h_57_cast_fp16 = mul(x = var_6179_cast_fp16_0, y = layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_57_cast_fp16")];
+            tensor<int32, [3]> var_6190 = const()[name = string("op_6190"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_289_axes_0 = const()[name = string("input_289_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6191 = transpose(perm = var_6190, x = h_57_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 2560, 1, 1]> input_289 = expand_dims(axes = input_289_axes_0, x = var_6191)[name = string("input_289")];
+            string gate_37_pad_type_0 = const()[name = string("gate_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_37_strides_0 = const()[name = string("gate_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_37_pad_0 = const()[name = string("gate_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_37_dilations_0 = const()[name = string("gate_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_37_groups_0 = const()[name = string("gate_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_37 = conv(dilations = gate_37_dilations_0, groups = gate_37_groups_0, pad = gate_37_pad_0, pad_type = gate_37_pad_type_0, strides = gate_37_strides_0, weight = layers_9_mlp_gate_proj_weight_palettized, x = input_289)[name = string("gate_37")];
+            string up_19_pad_type_0 = const()[name = string("up_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_19_strides_0 = const()[name = string("up_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_19_pad_0 = const()[name = string("up_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_19_dilations_0 = const()[name = string("up_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_19_groups_0 = const()[name = string("up_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_19 = conv(dilations = up_19_dilations_0, groups = up_19_groups_0, pad = up_19_pad_0, pad_type = up_19_pad_type_0, strides = up_19_strides_0, weight = layers_9_mlp_up_proj_weight_palettized, x = input_289)[name = string("up_19")];
+            string gate_39_mode_0 = const()[name = string("gate_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_39 = gelu(mode = gate_39_mode_0, x = gate_37)[name = string("gate_39")];
+            tensor<fp16, [1, 10240, 1, 1]> input_291 = mul(x = gate_39, y = up_19)[name = string("input_291")];
+            string mlp_out_19_pad_type_0 = const()[name = string("mlp_out_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_19_strides_0 = const()[name = string("mlp_out_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_19_pad_0 = const()[name = string("mlp_out_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_19_dilations_0 = const()[name = string("mlp_out_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_19_groups_0 = const()[name = string("mlp_out_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_19 = conv(dilations = mlp_out_19_dilations_0, groups = mlp_out_19_groups_0, pad = mlp_out_19_pad_0, pad_type = mlp_out_19_pad_type_0, strides = mlp_out_19_strides_0, weight = layers_9_mlp_down_proj_weight_palettized, x = input_291)[name = string("mlp_out_19")];
+            tensor<int32, [1]> var_6231_axes_0 = const()[name = string("op_6231_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6231 = squeeze(axes = var_6231_axes_0, x = mlp_out_19)[name = string("op_6231")];
+            tensor<int32, [3]> var_6235 = const()[name = string("op_6235"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6241 = const()[name = string("op_6241"), val = int32(-1)];
+            fp16 const_116_promoted = const()[name = string("const_116_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_195 = transpose(perm = var_6235, x = var_6231)[name = string("transpose_39")];
+            tensor<fp16, [1, 1, 2560]> var_6243 = mul(x = x_195, y = const_116_promoted)[name = string("op_6243")];
+            bool input_293_interleave_0 = const()[name = string("input_293_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_293 = concat(axis = var_6241, interleave = input_293_interleave_0, values = (x_195, var_6243))[name = string("input_293")];
+            tensor<int32, [1]> normed_277_axes_0 = const()[name = string("normed_277_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6238_to_fp16 = const()[name = string("op_6238_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_277_cast_fp16 = layer_norm(axes = normed_277_axes_0, epsilon = var_6238_to_fp16, x = input_293)[name = string("normed_277_cast_fp16")];
+            tensor<int32, [2]> var_6248_split_sizes_0 = const()[name = string("op_6248_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6248_axis_0 = const()[name = string("op_6248_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6248_0, tensor<fp16, [1, 1, 2560]> var_6248_1 = split(axis = var_6248_axis_0, split_sizes = var_6248_split_sizes_0, x = normed_277_cast_fp16)[name = string("op_6248")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_93 = mul(x = var_6248_0, y = layers_9_post_feedforward_layernorm_weight)[name = string("hidden_states_93")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_95_cast_fp16 = add(x = x_193_cast_fp16, y = hidden_states_93)[name = string("hidden_states_95_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_19_begin_0 = const()[name = string("per_layer_slice_19_begin_0"), val = tensor<int32, [3]>([0, 0, 2304])];
+            tensor<int32, [3]> per_layer_slice_19_end_0 = const()[name = string("per_layer_slice_19_end_0"), val = tensor<int32, [3]>([1, 1, 2560])];
+            tensor<bool, [3]> per_layer_slice_19_end_mask_0 = const()[name = string("per_layer_slice_19_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_19_cast_fp16 = slice_by_index(begin = per_layer_slice_19_begin_0, end = per_layer_slice_19_end_0, end_mask = per_layer_slice_19_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_19_cast_fp16")];
+            tensor<int32, [3]> var_6276 = const()[name = string("op_6276"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_295_axes_0 = const()[name = string("input_295_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6277 = transpose(perm = var_6276, x = hidden_states_95_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 2560, 1, 1]> input_295 = expand_dims(axes = input_295_axes_0, x = var_6277)[name = string("input_295")];
+            string gated_55_pad_type_0 = const()[name = string("gated_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_55_strides_0 = const()[name = string("gated_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_55_pad_0 = const()[name = string("gated_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_55_dilations_0 = const()[name = string("gated_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_55_groups_0 = const()[name = string("gated_55_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_55 = conv(dilations = gated_55_dilations_0, groups = gated_55_groups_0, pad = gated_55_pad_0, pad_type = gated_55_pad_type_0, strides = gated_55_strides_0, weight = layers_9_per_layer_input_gate_weight_palettized, x = input_295)[name = string("gated_55")];
+            string gated_57_mode_0 = const()[name = string("gated_57_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_57 = gelu(mode = gated_57_mode_0, x = gated_55)[name = string("gated_57")];
+            tensor<int32, [3]> var_6296 = const()[name = string("op_6296"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_19_axes_0 = const()[name = string("per_layer_slice_conv_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_6297_cast_fp16 = transpose(perm = var_6296, x = per_layer_slice_19_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_19_cast_fp16 = expand_dims(axes = per_layer_slice_conv_19_axes_0, x = var_6297_cast_fp16)[name = string("per_layer_slice_conv_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_297_cast_fp16 = mul(x = gated_57, y = per_layer_slice_conv_19_cast_fp16)[name = string("input_297_cast_fp16")];
+            string gated_59_pad_type_0 = const()[name = string("gated_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_59_strides_0 = const()[name = string("gated_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_59_pad_0 = const()[name = string("gated_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_59_dilations_0 = const()[name = string("gated_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_59_groups_0 = const()[name = string("gated_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_9_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577062976))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577390720))))[name = string("layers_9_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_59_cast_fp16 = conv(dilations = gated_59_dilations_0, groups = gated_59_groups_0, pad = gated_59_pad_0, pad_type = gated_59_pad_type_0, strides = gated_59_strides_0, weight = layers_9_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_297_cast_fp16)[name = string("gated_59_cast_fp16")];
+            tensor<int32, [1]> var_6313_axes_0 = const()[name = string("op_6313_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6313_cast_fp16 = squeeze(axes = var_6313_axes_0, x = gated_59_cast_fp16)[name = string("op_6313_cast_fp16")];
+            tensor<int32, [3]> var_6317 = const()[name = string("op_6317"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6323 = const()[name = string("op_6323"), val = int32(-1)];
+            fp16 const_117_promoted_to_fp16 = const()[name = string("const_117_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_197_cast_fp16 = transpose(perm = var_6317, x = var_6313_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 1, 2560]> var_6325_cast_fp16 = mul(x = x_197_cast_fp16, y = const_117_promoted_to_fp16)[name = string("op_6325_cast_fp16")];
+            bool input_299_interleave_0 = const()[name = string("input_299_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_299_cast_fp16 = concat(axis = var_6323, interleave = input_299_interleave_0, values = (x_197_cast_fp16, var_6325_cast_fp16))[name = string("input_299_cast_fp16")];
+            tensor<int32, [1]> normed_281_axes_0 = const()[name = string("normed_281_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6320_to_fp16 = const()[name = string("op_6320_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_281_cast_fp16 = layer_norm(axes = normed_281_axes_0, epsilon = var_6320_to_fp16, x = input_299_cast_fp16)[name = string("normed_281_cast_fp16")];
+            tensor<int32, [2]> var_6330_split_sizes_0 = const()[name = string("op_6330_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6330_axis_0 = const()[name = string("op_6330_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6330_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6330_cast_fp16_1 = split(axis = var_6330_axis_0, split_sizes = var_6330_split_sizes_0, x = normed_281_cast_fp16)[name = string("op_6330_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577393344)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_99_cast_fp16 = mul(x = var_6330_cast_fp16_0, y = layers_9_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_101_cast_fp16 = add(x = hidden_states_95_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1]> const_118_promoted_to_fp16 = const()[name = string("const_118_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a8p-2])];
+            tensor<fp16, [1, 1, 2560]> x_199_cast_fp16 = mul(x = hidden_states_101_cast_fp16, y = const_118_promoted_to_fp16)[name = string("x_199_cast_fp16")];
+            tensor<int32, [1]> var_6342_axes_0 = const()[name = string("op_6342_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6342_cast_fp16 = squeeze(axes = var_6342_axes_0, x = K_sliding_out_17_cast_fp16)[name = string("op_6342_cast_fp16")];
+            tensor<int32, [1]> var_6344_axes_0 = const()[name = string("op_6344_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6344_cast_fp16 = squeeze(axes = var_6344_axes_0, x = V_sliding_out_17_cast_fp16)[name = string("op_6344_cast_fp16")];
+            tensor<int32, [4]> var_6347_begin_0 = const()[name = string("op_6347_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6347_end_0 = const()[name = string("op_6347_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6347_end_mask_0 = const()[name = string("op_6347_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6347_squeeze_mask_0 = const()[name = string("op_6347_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6347_cast_fp16 = slice_by_index(begin = var_6347_begin_0, end = var_6347_end_0, end_mask = var_6347_end_mask_0, squeeze_mask = var_6347_squeeze_mask_0, x = K_sliding_in)[name = string("op_6347_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_axes_0 = const()[name = string("K_sliding_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_cast_fp16 = expand_dims(axes = K_sliding_slot_axes_0, x = var_6347_cast_fp16)[name = string("K_sliding_slot_cast_fp16")];
+            tensor<int32, [4]> var_6352_begin_0 = const()[name = string("op_6352_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6352_end_0 = const()[name = string("op_6352_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6352_end_mask_0 = const()[name = string("op_6352_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6352_squeeze_mask_0 = const()[name = string("op_6352_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6352_cast_fp16 = slice_by_index(begin = var_6352_begin_0, end = var_6352_end_0, end_mask = var_6352_end_mask_0, squeeze_mask = var_6352_squeeze_mask_0, x = V_sliding_in)[name = string("op_6352_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_axes_0 = const()[name = string("V_sliding_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_cast_fp16 = expand_dims(axes = V_sliding_slot_axes_0, x = var_6352_cast_fp16)[name = string("V_sliding_slot_cast_fp16")];
+            int32 var_6359 = const()[name = string("op_6359"), val = int32(-1)];
+            fp16 const_119_promoted_to_fp16 = const()[name = string("const_119_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6361_cast_fp16 = mul(x = x_199_cast_fp16, y = const_119_promoted_to_fp16)[name = string("op_6361_cast_fp16")];
+            bool input_301_interleave_0 = const()[name = string("input_301_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_301_cast_fp16 = concat(axis = var_6359, interleave = input_301_interleave_0, values = (x_199_cast_fp16, var_6361_cast_fp16))[name = string("input_301_cast_fp16")];
+            tensor<int32, [1]> normed_285_axes_0 = const()[name = string("normed_285_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6356_to_fp16 = const()[name = string("op_6356_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_285_cast_fp16 = layer_norm(axes = normed_285_axes_0, epsilon = var_6356_to_fp16, x = input_301_cast_fp16)[name = string("normed_285_cast_fp16")];
+            tensor<int32, [2]> var_6366_split_sizes_0 = const()[name = string("op_6366_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6366_axis_0 = const()[name = string("op_6366_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6366_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6366_cast_fp16_1 = split(axis = var_6366_axis_0, split_sizes = var_6366_split_sizes_0, x = normed_285_cast_fp16)[name = string("op_6366_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577398528)))];
+            tensor<fp16, [1, 1, 2560]> h_61_cast_fp16 = mul(x = var_6366_cast_fp16_0, y = layers_10_input_layernorm_weight_promoted_to_fp16)[name = string("h_61_cast_fp16")];
+            tensor<int32, [3]> var_6372 = const()[name = string("op_6372"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6375_axes_0 = const()[name = string("op_6375_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6373_cast_fp16 = transpose(perm = var_6372, x = h_61_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 2560, 1, 1]> var_6375_cast_fp16 = expand_dims(axes = var_6375_axes_0, x = var_6373_cast_fp16)[name = string("op_6375_cast_fp16")];
+            string var_6391_pad_type_0 = const()[name = string("op_6391_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6391_strides_0 = const()[name = string("op_6391_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6391_pad_0 = const()[name = string("op_6391_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6391_dilations_0 = const()[name = string("op_6391_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6391_groups_0 = const()[name = string("op_6391_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_6391 = conv(dilations = var_6391_dilations_0, groups = var_6391_groups_0, pad = var_6391_pad_0, pad_type = var_6391_pad_type_0, strides = var_6391_strides_0, weight = layers_10_self_attn_q_proj_weight_palettized, x = var_6375_cast_fp16)[name = string("op_6391")];
+            tensor<int32, [4]> var_6396 = const()[name = string("op_6396"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_6397 = reshape(shape = var_6396, x = var_6391)[name = string("op_6397")];
+            tensor<int32, [4]> var_6402 = const()[name = string("op_6402"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6412 = const()[name = string("op_6412"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_6403 = transpose(perm = var_6402, x = var_6397)[name = string("transpose_34")];
+            tensor<fp16, [1, 8, 256]> x_201 = reshape(shape = var_6412, x = var_6403)[name = string("x_201")];
+            int32 var_6418 = const()[name = string("op_6418"), val = int32(-1)];
+            fp16 const_120_promoted = const()[name = string("const_120_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_6420 = mul(x = x_201, y = const_120_promoted)[name = string("op_6420")];
+            bool input_305_interleave_0 = const()[name = string("input_305_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_305 = concat(axis = var_6418, interleave = input_305_interleave_0, values = (x_201, var_6420))[name = string("input_305")];
+            tensor<int32, [1]> normed_289_axes_0 = const()[name = string("normed_289_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6415_to_fp16 = const()[name = string("op_6415_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_289_cast_fp16 = layer_norm(axes = normed_289_axes_0, epsilon = var_6415_to_fp16, x = input_305)[name = string("normed_289_cast_fp16")];
+            tensor<int32, [2]> var_6425_split_sizes_0 = const()[name = string("op_6425_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6425_axis_0 = const()[name = string("op_6425_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_6425_0, tensor<fp16, [1, 8, 256]> var_6425_1 = split(axis = var_6425_axis_0, split_sizes = var_6425_split_sizes_0, x = normed_289_cast_fp16)[name = string("op_6425")];
+            tensor<fp16, [1, 8, 256]> var_6427 = mul(x = var_6425_0, y = layers_3_self_attn_q_norm_weight)[name = string("op_6427")];
+            tensor<int32, [4]> var_6432 = const()[name = string("op_6432"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_83 = reshape(shape = var_6432, x = var_6427)[name = string("q_83")];
+            tensor<fp16, [1, 8, 1, 256]> var_6434_cast_fp16 = mul(x = q_83, y = cos_s)[name = string("op_6434_cast_fp16")];
+            tensor<int32, [2]> var_6435_split_sizes_0 = const()[name = string("op_6435_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6435_axis_0 = const()[name = string("op_6435_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_6435_0, tensor<fp16, [1, 8, 1, 128]> var_6435_1 = split(axis = var_6435_axis_0, split_sizes = var_6435_split_sizes_0, x = q_83)[name = string("op_6435")];
+            fp16 const_121_promoted = const()[name = string("const_121_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_6437 = mul(x = var_6435_1, y = const_121_promoted)[name = string("op_6437")];
+            int32 var_6439 = const()[name = string("op_6439"), val = int32(-1)];
+            bool var_6440_interleave_0 = const()[name = string("op_6440_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_6440 = concat(axis = var_6439, interleave = var_6440_interleave_0, values = (var_6437, var_6435_0))[name = string("op_6440")];
+            tensor<fp16, [1, 8, 1, 256]> var_6441_cast_fp16 = mul(x = var_6440, y = sin_s)[name = string("op_6441_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_87_cast_fp16 = add(x = var_6434_cast_fp16, y = var_6441_cast_fp16)[name = string("q_87_cast_fp16")];
+            string var_6454_pad_type_0 = const()[name = string("op_6454_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6454_strides_0 = const()[name = string("op_6454_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6454_pad_0 = const()[name = string("op_6454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6454_dilations_0 = const()[name = string("op_6454_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6454_groups_0 = const()[name = string("op_6454_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6454 = conv(dilations = var_6454_dilations_0, groups = var_6454_groups_0, pad = var_6454_pad_0, pad_type = var_6454_pad_type_0, strides = var_6454_strides_0, weight = layers_10_self_attn_k_proj_weight_palettized, x = var_6375_cast_fp16)[name = string("op_6454")];
+            tensor<int32, [4]> var_6459 = const()[name = string("op_6459"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6460 = reshape(shape = var_6459, x = var_6454)[name = string("op_6460")];
+            tensor<int32, [4]> var_6465 = const()[name = string("op_6465"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_6482_pad_type_0 = const()[name = string("op_6482_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6482_strides_0 = const()[name = string("op_6482_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6482_pad_0 = const()[name = string("op_6482_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6482_dilations_0 = const()[name = string("op_6482_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6482_groups_0 = const()[name = string("op_6482_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6482 = conv(dilations = var_6482_dilations_0, groups = var_6482_groups_0, pad = var_6482_pad_0, pad_type = var_6482_pad_type_0, strides = var_6482_strides_0, weight = layers_10_self_attn_v_proj_weight_palettized, x = var_6375_cast_fp16)[name = string("op_6482")];
+            tensor<int32, [4]> var_6487 = const()[name = string("op_6487"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6488 = reshape(shape = var_6487, x = var_6482)[name = string("op_6488")];
+            tensor<int32, [4]> var_6493 = const()[name = string("op_6493"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6503 = const()[name = string("op_6503"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_6466 = transpose(perm = var_6465, x = var_6460)[name = string("transpose_33")];
+            tensor<fp16, [1, 2, 256]> x_203 = reshape(shape = var_6503, x = var_6466)[name = string("x_203")];
+            int32 var_6509 = const()[name = string("op_6509"), val = int32(-1)];
+            fp16 const_122_promoted = const()[name = string("const_122_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_6511 = mul(x = x_203, y = const_122_promoted)[name = string("op_6511")];
+            bool input_307_interleave_0 = const()[name = string("input_307_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_307 = concat(axis = var_6509, interleave = input_307_interleave_0, values = (x_203, var_6511))[name = string("input_307")];
+            tensor<int32, [1]> normed_293_axes_0 = const()[name = string("normed_293_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6506_to_fp16 = const()[name = string("op_6506_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_293_cast_fp16 = layer_norm(axes = normed_293_axes_0, epsilon = var_6506_to_fp16, x = input_307)[name = string("normed_293_cast_fp16")];
+            tensor<int32, [2]> var_6516_split_sizes_0 = const()[name = string("op_6516_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6516_axis_0 = const()[name = string("op_6516_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_6516_0, tensor<fp16, [1, 2, 256]> var_6516_1 = split(axis = var_6516_axis_0, split_sizes = var_6516_split_sizes_0, x = normed_293_cast_fp16)[name = string("op_6516")];
+            tensor<fp16, [1, 2, 256]> var_6518 = mul(x = var_6516_0, y = layers_6_self_attn_k_norm_weight)[name = string("op_6518")];
+            tensor<int32, [4]> var_6523 = const()[name = string("op_6523"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_85 = reshape(shape = var_6523, x = var_6518)[name = string("q_85")];
+            fp16 var_6525_promoted = const()[name = string("op_6525_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_6494 = transpose(perm = var_6493, x = var_6488)[name = string("transpose_32")];
+            tensor<fp16, [1, 2, 1, 256]> var_6526 = pow(x = var_6494, y = var_6525_promoted)[name = string("op_6526")];
+            tensor<int32, [1]> var_6531_axes_0 = const()[name = string("op_6531_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6531_keep_dims_0 = const()[name = string("op_6531_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_6531 = reduce_mean(axes = var_6531_axes_0, keep_dims = var_6531_keep_dims_0, x = var_6526)[name = string("op_6531")];
+            fp16 var_6533_to_fp16 = const()[name = string("op_6533_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_21_cast_fp16 = add(x = var_6531, y = var_6533_to_fp16)[name = string("mean_sq_21_cast_fp16")];
+            fp32 var_6535_epsilon_0 = const()[name = string("op_6535_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_6535_cast_fp16 = rsqrt(epsilon = var_6535_epsilon_0, x = mean_sq_21_cast_fp16)[name = string("op_6535_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_311_cast_fp16 = mul(x = var_6494, y = var_6535_cast_fp16)[name = string("input_311_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_6537_cast_fp16 = mul(x = q_85, y = cos_s)[name = string("op_6537_cast_fp16")];
+            tensor<int32, [2]> var_6538_split_sizes_0 = const()[name = string("op_6538_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6538_axis_0 = const()[name = string("op_6538_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_6538_0, tensor<fp16, [1, 2, 1, 128]> var_6538_1 = split(axis = var_6538_axis_0, split_sizes = var_6538_split_sizes_0, x = q_85)[name = string("op_6538")];
+            fp16 const_123_promoted = const()[name = string("const_123_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_6540 = mul(x = var_6538_1, y = const_123_promoted)[name = string("op_6540")];
+            int32 var_6542 = const()[name = string("op_6542"), val = int32(-1)];
+            bool var_6543_interleave_0 = const()[name = string("op_6543_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_6543 = concat(axis = var_6542, interleave = var_6543_interleave_0, values = (var_6540, var_6538_0))[name = string("op_6543")];
+            tensor<fp16, [1, 2, 1, 256]> var_6544_cast_fp16 = mul(x = var_6543, y = sin_s)[name = string("op_6544_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_309_cast_fp16 = add(x = var_6537_cast_fp16, y = var_6544_cast_fp16)[name = string("input_309_cast_fp16")];
+            tensor<int32, [8]> k_padded_pad_0 = const()[name = string("k_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_mode_0 = const()[name = string("k_padded_mode_0"), val = string("constant")];
+            fp16 const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_cast_fp16 = pad(constant_val = const_124_to_fp16, mode = k_padded_mode_0, pad = k_padded_pad_0, x = input_309_cast_fp16)[name = string("k_padded_cast_fp16")];
+            tensor<int32, [8]> v_padded_pad_0 = const()[name = string("v_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_mode_0 = const()[name = string("v_padded_mode_0"), val = string("constant")];
+            fp16 const_125_to_fp16 = const()[name = string("const_125_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_cast_fp16 = pad(constant_val = const_125_to_fp16, mode = v_padded_mode_0, pad = v_padded_pad_0, x = input_311_cast_fp16)[name = string("v_padded_cast_fp16")];
+            tensor<int32, [4]> var_6573_begin_0 = const()[name = string("op_6573_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6573_end_0 = const()[name = string("op_6573_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6573_end_mask_0 = const()[name = string("op_6573_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6573_cast_fp16 = slice_by_index(begin = var_6573_begin_0, end = var_6573_end_0, end_mask = var_6573_end_mask_0, x = K_sliding_slot_cast_fp16)[name = string("op_6573_cast_fp16")];
+            int32 var_6580 = const()[name = string("op_6580"), val = int32(2)];
+            bool K_sliding_out_interleave_0 = const()[name = string("K_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_cast_fp16 = concat(axis = var_6580, interleave = K_sliding_out_interleave_0, values = (var_6573_cast_fp16, k_padded_cast_fp16))[name = string("K_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_6596_begin_0 = const()[name = string("op_6596_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6596_end_0 = const()[name = string("op_6596_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6596_end_mask_0 = const()[name = string("op_6596_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6596_cast_fp16 = slice_by_index(begin = var_6596_begin_0, end = var_6596_end_0, end_mask = var_6596_end_mask_0, x = V_sliding_slot_cast_fp16)[name = string("op_6596_cast_fp16")];
+            int32 var_6603 = const()[name = string("op_6603"), val = int32(2)];
+            bool V_sliding_out_interleave_0 = const()[name = string("V_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_cast_fp16 = concat(axis = var_6603, interleave = V_sliding_out_interleave_0, values = (var_6596_cast_fp16, v_padded_cast_fp16))[name = string("V_sliding_out_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_21_begin_0 = const()[name = string("K_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_21_end_0 = const()[name = string("K_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_21_end_mask_0 = const()[name = string("K_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_21_cast_fp16 = slice_by_index(begin = K_for_attn_21_begin_0, end = K_for_attn_21_end_0, end_mask = K_for_attn_21_end_mask_0, x = K_sliding_out_cast_fp16)[name = string("K_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_21_begin_0 = const()[name = string("V_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_21_end_0 = const()[name = string("V_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_21_end_mask_0 = const()[name = string("V_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_21_cast_fp16 = slice_by_index(begin = V_for_attn_21_begin_0, end = V_for_attn_21_end_0, end_mask = V_for_attn_21_end_mask_0, x = V_sliding_out_cast_fp16)[name = string("V_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_20_reps_0 = const()[name = string("tile_20_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_40_cast_fp16 = transpose(perm = transpose_40_perm_0, x = K_for_attn_21_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [8, 1, 512, 256]> tile_20_cast_fp16 = tile(reps = tile_20_reps_0, x = transpose_40_cast_fp16)[name = string("tile_20_cast_fp16")];
+            tensor<int32, [5]> concat_40 = const()[name = string("concat_40"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_40_cast_fp16 = reshape(shape = concat_40, x = tile_20_cast_fp16)[name = string("reshape_40_cast_fp16")];
+            tensor<int32, [5]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_41 = const()[name = string("concat_41"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_40_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_41_cast_fp16 = reshape(shape = concat_41, x = transpose_41_cast_fp16)[name = string("reshape_41_cast_fp16")];
+            tensor<int32, [4]> transpose_58_perm_0 = const()[name = string("transpose_58_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_21_reps_0 = const()[name = string("tile_21_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_42_cast_fp16 = transpose(perm = transpose_42_perm_0, x = V_for_attn_21_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [8, 1, 512, 256]> tile_21_cast_fp16 = tile(reps = tile_21_reps_0, x = transpose_42_cast_fp16)[name = string("tile_21_cast_fp16")];
+            tensor<int32, [5]> concat_42 = const()[name = string("concat_42"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_42_cast_fp16 = reshape(shape = concat_42, x = tile_21_cast_fp16)[name = string("reshape_42_cast_fp16")];
+            tensor<int32, [5]> transpose_43_perm_0 = const()[name = string("transpose_43_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_43 = const()[name = string("concat_43"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_43_cast_fp16 = transpose(perm = transpose_43_perm_0, x = reshape_42_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_43_cast_fp16 = reshape(shape = concat_43, x = transpose_43_cast_fp16)[name = string("reshape_43_cast_fp16")];
+            tensor<int32, [4]> V_expanded_21_perm_0 = const()[name = string("V_expanded_21_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_41_transpose_x_0 = const()[name = string("attn_weights_41_transpose_x_0"), val = bool(false)];
+            bool attn_weights_41_transpose_y_0 = const()[name = string("attn_weights_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_58_cast_fp16 = transpose(perm = transpose_58_perm_0, x = reshape_41_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_41_cast_fp16 = matmul(transpose_x = attn_weights_41_transpose_x_0, transpose_y = attn_weights_41_transpose_y_0, x = q_87_cast_fp16, y = transpose_58_cast_fp16)[name = string("attn_weights_41_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_207_cast_fp16 = add(x = attn_weights_41_cast_fp16, y = causal_mask_sliding)[name = string("x_207_cast_fp16")];
+            tensor<int32, [1]> reduce_max_10_axes_0 = const()[name = string("reduce_max_10_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_10_keep_dims_0 = const()[name = string("reduce_max_10_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_10 = reduce_max(axes = reduce_max_10_axes_0, keep_dims = reduce_max_10_keep_dims_0, x = x_207_cast_fp16)[name = string("reduce_max_10")];
+            tensor<fp16, [1, 8, 1, 512]> var_6644 = sub(x = x_207_cast_fp16, y = reduce_max_10)[name = string("op_6644")];
+            tensor<fp16, [1, 8, 1, 512]> var_6650 = exp(x = var_6644)[name = string("op_6650")];
+            tensor<int32, [1]> var_6660_axes_0 = const()[name = string("op_6660_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6660_keep_dims_0 = const()[name = string("op_6660_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_6660 = reduce_sum(axes = var_6660_axes_0, keep_dims = var_6660_keep_dims_0, x = var_6650)[name = string("op_6660")];
+            tensor<fp16, [1, 8, 1, 512]> var_6666_cast_fp16 = real_div(x = var_6650, y = var_6660)[name = string("op_6666_cast_fp16")];
+            bool attn_output_61_transpose_x_0 = const()[name = string("attn_output_61_transpose_x_0"), val = bool(false)];
+            bool attn_output_61_transpose_y_0 = const()[name = string("attn_output_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_21_cast_fp16 = transpose(perm = V_expanded_21_perm_0, x = reshape_43_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_61_cast_fp16 = matmul(transpose_x = attn_output_61_transpose_x_0, transpose_y = attn_output_61_transpose_y_0, x = var_6666_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_61_cast_fp16")];
+            tensor<int32, [4]> var_6677 = const()[name = string("op_6677"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6684 = const()[name = string("op_6684"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_6678_cast_fp16 = transpose(perm = var_6677, x = attn_output_61_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 1, 2048]> attn_output_63_cast_fp16 = reshape(shape = var_6684, x = var_6678_cast_fp16)[name = string("attn_output_63_cast_fp16")];
+            tensor<int32, [3]> var_6689 = const()[name = string("op_6689"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6705_pad_type_0 = const()[name = string("op_6705_pad_type_0"), val = string("valid")];
+            int32 var_6705_groups_0 = const()[name = string("op_6705_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6705_strides_0 = const()[name = string("op_6705_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6705_pad_0 = const()[name = string("op_6705_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6705_dilations_0 = const()[name = string("op_6705_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_10_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577403712))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580025216))))[name = string("squeeze_10_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_6690_cast_fp16 = transpose(perm = var_6689, x = attn_output_63_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1]> var_6705_cast_fp16 = conv(dilations = var_6705_dilations_0, groups = var_6705_groups_0, pad = var_6705_pad_0, pad_type = var_6705_pad_type_0, strides = var_6705_strides_0, weight = squeeze_10_cast_fp16_to_fp32_to_fp16_palettized, x = var_6690_cast_fp16)[name = string("op_6705_cast_fp16")];
+            tensor<int32, [3]> var_6709 = const()[name = string("op_6709"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6715 = const()[name = string("op_6715"), val = int32(-1)];
+            fp16 const_126_promoted_to_fp16 = const()[name = string("const_126_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_211_cast_fp16 = transpose(perm = var_6709, x = var_6705_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 1, 2560]> var_6717_cast_fp16 = mul(x = x_211_cast_fp16, y = const_126_promoted_to_fp16)[name = string("op_6717_cast_fp16")];
+            bool input_315_interleave_0 = const()[name = string("input_315_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_315_cast_fp16 = concat(axis = var_6715, interleave = input_315_interleave_0, values = (x_211_cast_fp16, var_6717_cast_fp16))[name = string("input_315_cast_fp16")];
+            tensor<int32, [1]> normed_297_axes_0 = const()[name = string("normed_297_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6712_to_fp16 = const()[name = string("op_6712_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_297_cast_fp16 = layer_norm(axes = normed_297_axes_0, epsilon = var_6712_to_fp16, x = input_315_cast_fp16)[name = string("normed_297_cast_fp16")];
+            tensor<int32, [2]> var_6722_split_sizes_0 = const()[name = string("op_6722_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6722_axis_0 = const()[name = string("op_6722_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6722_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6722_cast_fp16_1 = split(axis = var_6722_axis_0, split_sizes = var_6722_split_sizes_0, x = normed_297_cast_fp16)[name = string("op_6722_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580027840)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_65_cast_fp16 = mul(x = var_6722_cast_fp16_0, y = layers_10_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_65_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_213_cast_fp16 = add(x = x_199_cast_fp16, y = attn_output_65_cast_fp16)[name = string("x_213_cast_fp16")];
+            int32 var_6731 = const()[name = string("op_6731"), val = int32(-1)];
+            fp16 const_127_promoted_to_fp16 = const()[name = string("const_127_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6733_cast_fp16 = mul(x = x_213_cast_fp16, y = const_127_promoted_to_fp16)[name = string("op_6733_cast_fp16")];
+            bool input_317_interleave_0 = const()[name = string("input_317_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_317_cast_fp16 = concat(axis = var_6731, interleave = input_317_interleave_0, values = (x_213_cast_fp16, var_6733_cast_fp16))[name = string("input_317_cast_fp16")];
+            tensor<int32, [1]> normed_301_axes_0 = const()[name = string("normed_301_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6728_to_fp16 = const()[name = string("op_6728_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_301_cast_fp16 = layer_norm(axes = normed_301_axes_0, epsilon = var_6728_to_fp16, x = input_317_cast_fp16)[name = string("normed_301_cast_fp16")];
+            tensor<int32, [2]> var_6738_split_sizes_0 = const()[name = string("op_6738_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6738_axis_0 = const()[name = string("op_6738_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6738_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6738_cast_fp16_1 = split(axis = var_6738_axis_0, split_sizes = var_6738_split_sizes_0, x = normed_301_cast_fp16)[name = string("op_6738_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580033024)))];
+            tensor<fp16, [1, 1, 2560]> h_63_cast_fp16 = mul(x = var_6738_cast_fp16_0, y = layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_63_cast_fp16")];
+            tensor<int32, [3]> var_6749 = const()[name = string("op_6749"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_319_axes_0 = const()[name = string("input_319_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6750 = transpose(perm = var_6749, x = h_63_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 2560, 1, 1]> input_319 = expand_dims(axes = input_319_axes_0, x = var_6750)[name = string("input_319")];
+            string gate_41_pad_type_0 = const()[name = string("gate_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_41_strides_0 = const()[name = string("gate_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_41_pad_0 = const()[name = string("gate_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_41_dilations_0 = const()[name = string("gate_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_41_groups_0 = const()[name = string("gate_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_41 = conv(dilations = gate_41_dilations_0, groups = gate_41_groups_0, pad = gate_41_pad_0, pad_type = gate_41_pad_type_0, strides = gate_41_strides_0, weight = layers_10_mlp_gate_proj_weight_palettized, x = input_319)[name = string("gate_41")];
+            string up_21_pad_type_0 = const()[name = string("up_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_21_strides_0 = const()[name = string("up_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_21_pad_0 = const()[name = string("up_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_21_dilations_0 = const()[name = string("up_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_21_groups_0 = const()[name = string("up_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_21 = conv(dilations = up_21_dilations_0, groups = up_21_groups_0, pad = up_21_pad_0, pad_type = up_21_pad_type_0, strides = up_21_strides_0, weight = layers_10_mlp_up_proj_weight_palettized, x = input_319)[name = string("up_21")];
+            string gate_43_mode_0 = const()[name = string("gate_43_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_43 = gelu(mode = gate_43_mode_0, x = gate_41)[name = string("gate_43")];
+            tensor<fp16, [1, 10240, 1, 1]> input_321 = mul(x = gate_43, y = up_21)[name = string("input_321")];
+            string mlp_out_21_pad_type_0 = const()[name = string("mlp_out_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_21_strides_0 = const()[name = string("mlp_out_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_21_pad_0 = const()[name = string("mlp_out_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_21_dilations_0 = const()[name = string("mlp_out_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_21_groups_0 = const()[name = string("mlp_out_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_21 = conv(dilations = mlp_out_21_dilations_0, groups = mlp_out_21_groups_0, pad = mlp_out_21_pad_0, pad_type = mlp_out_21_pad_type_0, strides = mlp_out_21_strides_0, weight = layers_10_mlp_down_proj_weight_palettized, x = input_321)[name = string("mlp_out_21")];
+            tensor<int32, [1]> var_6790_axes_0 = const()[name = string("op_6790_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6790 = squeeze(axes = var_6790_axes_0, x = mlp_out_21)[name = string("op_6790")];
+            tensor<int32, [3]> var_6794 = const()[name = string("op_6794"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6800 = const()[name = string("op_6800"), val = int32(-1)];
+            fp16 const_128_promoted = const()[name = string("const_128_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_215 = transpose(perm = var_6794, x = var_6790)[name = string("transpose_21")];
+            tensor<fp16, [1, 1, 2560]> var_6802 = mul(x = x_215, y = const_128_promoted)[name = string("op_6802")];
+            bool input_323_interleave_0 = const()[name = string("input_323_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_323 = concat(axis = var_6800, interleave = input_323_interleave_0, values = (x_215, var_6802))[name = string("input_323")];
+            tensor<int32, [1]> normed_305_axes_0 = const()[name = string("normed_305_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6797_to_fp16 = const()[name = string("op_6797_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_305_cast_fp16 = layer_norm(axes = normed_305_axes_0, epsilon = var_6797_to_fp16, x = input_323)[name = string("normed_305_cast_fp16")];
+            tensor<int32, [2]> var_6807_split_sizes_0 = const()[name = string("op_6807_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6807_axis_0 = const()[name = string("op_6807_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6807_0, tensor<fp16, [1, 1, 2560]> var_6807_1 = split(axis = var_6807_axis_0, split_sizes = var_6807_split_sizes_0, x = normed_305_cast_fp16)[name = string("op_6807")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_103 = mul(x = var_6807_0, y = layers_10_post_feedforward_layernorm_weight)[name = string("hidden_states_103")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_105_cast_fp16 = add(x = x_213_cast_fp16, y = hidden_states_103)[name = string("hidden_states_105_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_21_begin_0 = const()[name = string("per_layer_slice_21_begin_0"), val = tensor<int32, [3]>([0, 0, 2560])];
+            tensor<int32, [3]> per_layer_slice_21_end_0 = const()[name = string("per_layer_slice_21_end_0"), val = tensor<int32, [3]>([1, 1, 2816])];
+            tensor<bool, [3]> per_layer_slice_21_end_mask_0 = const()[name = string("per_layer_slice_21_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_21_cast_fp16 = slice_by_index(begin = per_layer_slice_21_begin_0, end = per_layer_slice_21_end_0, end_mask = per_layer_slice_21_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_21_cast_fp16")];
+            tensor<int32, [3]> var_6835 = const()[name = string("op_6835"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_325_axes_0 = const()[name = string("input_325_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6836 = transpose(perm = var_6835, x = hidden_states_105_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 2560, 1, 1]> input_325 = expand_dims(axes = input_325_axes_0, x = var_6836)[name = string("input_325")];
+            string gated_61_pad_type_0 = const()[name = string("gated_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_61_strides_0 = const()[name = string("gated_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_61_pad_0 = const()[name = string("gated_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_61_dilations_0 = const()[name = string("gated_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_61_groups_0 = const()[name = string("gated_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_61 = conv(dilations = gated_61_dilations_0, groups = gated_61_groups_0, pad = gated_61_pad_0, pad_type = gated_61_pad_type_0, strides = gated_61_strides_0, weight = layers_10_per_layer_input_gate_weight_palettized, x = input_325)[name = string("gated_61")];
+            string gated_63_mode_0 = const()[name = string("gated_63_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_63 = gelu(mode = gated_63_mode_0, x = gated_61)[name = string("gated_63")];
+            tensor<int32, [3]> var_6855 = const()[name = string("op_6855"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_21_axes_0 = const()[name = string("per_layer_slice_conv_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_6856_cast_fp16 = transpose(perm = var_6855, x = per_layer_slice_21_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_21_cast_fp16 = expand_dims(axes = per_layer_slice_conv_21_axes_0, x = var_6856_cast_fp16)[name = string("per_layer_slice_conv_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_327_cast_fp16 = mul(x = gated_63, y = per_layer_slice_conv_21_cast_fp16)[name = string("input_327_cast_fp16")];
+            string gated_65_pad_type_0 = const()[name = string("gated_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_65_strides_0 = const()[name = string("gated_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_65_pad_0 = const()[name = string("gated_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_65_dilations_0 = const()[name = string("gated_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_65_groups_0 = const()[name = string("gated_65_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_10_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580038208))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580365952))))[name = string("layers_10_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_65_cast_fp16 = conv(dilations = gated_65_dilations_0, groups = gated_65_groups_0, pad = gated_65_pad_0, pad_type = gated_65_pad_type_0, strides = gated_65_strides_0, weight = layers_10_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_327_cast_fp16)[name = string("gated_65_cast_fp16")];
+            tensor<int32, [1]> var_6872_axes_0 = const()[name = string("op_6872_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6872_cast_fp16 = squeeze(axes = var_6872_axes_0, x = gated_65_cast_fp16)[name = string("op_6872_cast_fp16")];
+            tensor<int32, [3]> var_6876 = const()[name = string("op_6876"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6882 = const()[name = string("op_6882"), val = int32(-1)];
+            fp16 const_129_promoted_to_fp16 = const()[name = string("const_129_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_217_cast_fp16 = transpose(perm = var_6876, x = var_6872_cast_fp16)[name = string("transpose_18")];
+            tensor<fp16, [1, 1, 2560]> var_6884_cast_fp16 = mul(x = x_217_cast_fp16, y = const_129_promoted_to_fp16)[name = string("op_6884_cast_fp16")];
+            bool input_329_interleave_0 = const()[name = string("input_329_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_329_cast_fp16 = concat(axis = var_6882, interleave = input_329_interleave_0, values = (x_217_cast_fp16, var_6884_cast_fp16))[name = string("input_329_cast_fp16")];
+            tensor<int32, [1]> normed_309_axes_0 = const()[name = string("normed_309_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6879_to_fp16 = const()[name = string("op_6879_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_309_cast_fp16 = layer_norm(axes = normed_309_axes_0, epsilon = var_6879_to_fp16, x = input_329_cast_fp16)[name = string("normed_309_cast_fp16")];
+            tensor<int32, [2]> var_6889_split_sizes_0 = const()[name = string("op_6889_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6889_axis_0 = const()[name = string("op_6889_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6889_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6889_cast_fp16_1 = split(axis = var_6889_axis_0, split_sizes = var_6889_split_sizes_0, x = normed_309_cast_fp16)[name = string("op_6889_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580368576)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_109_cast_fp16 = mul(x = var_6889_cast_fp16_0, y = layers_10_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_111_cast_fp16 = add(x = hidden_states_105_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1]> const_130_promoted_to_fp16 = const()[name = string("const_130_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.3ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_219_cast_fp16 = mul(x = hidden_states_111_cast_fp16, y = const_130_promoted_to_fp16)[name = string("x_219_cast_fp16")];
+            tensor<int32, [1]> var_6901_axes_0 = const()[name = string("op_6901_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6901_cast_fp16 = squeeze(axes = var_6901_axes_0, x = K_sliding_out_cast_fp16)[name = string("op_6901_cast_fp16")];
+            tensor<int32, [1]> var_6903_axes_0 = const()[name = string("op_6903_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6903_cast_fp16 = squeeze(axes = var_6903_axes_0, x = V_sliding_out_cast_fp16)[name = string("op_6903_cast_fp16")];
+            tensor<int32, [4]> var_6906_begin_0 = const()[name = string("op_6906_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_6906_end_0 = const()[name = string("op_6906_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_6906_end_mask_0 = const()[name = string("op_6906_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6906_squeeze_mask_0 = const()[name = string("op_6906_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_6906_cast_fp16 = slice_by_index(begin = var_6906_begin_0, end = var_6906_end_0, end_mask = var_6906_end_mask_0, squeeze_mask = var_6906_squeeze_mask_0, x = K_full_in)[name = string("op_6906_cast_fp16")];
+            tensor<int32, [1]> K_full_slot_axes_0 = const()[name = string("K_full_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_slot_cast_fp16 = expand_dims(axes = K_full_slot_axes_0, x = var_6906_cast_fp16)[name = string("K_full_slot_cast_fp16")];
+            tensor<int32, [4]> var_6911_begin_0 = const()[name = string("op_6911_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_6911_end_0 = const()[name = string("op_6911_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_6911_end_mask_0 = const()[name = string("op_6911_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6911_squeeze_mask_0 = const()[name = string("op_6911_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_6911_cast_fp16 = slice_by_index(begin = var_6911_begin_0, end = var_6911_end_0, end_mask = var_6911_end_mask_0, squeeze_mask = var_6911_squeeze_mask_0, x = V_full_in)[name = string("op_6911_cast_fp16")];
+            tensor<int32, [1]> V_full_slot_axes_0 = const()[name = string("V_full_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_slot_cast_fp16 = expand_dims(axes = V_full_slot_axes_0, x = var_6911_cast_fp16)[name = string("V_full_slot_cast_fp16")];
+            int32 var_6918 = const()[name = string("op_6918"), val = int32(-1)];
+            fp16 const_131_promoted_to_fp16 = const()[name = string("const_131_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6920_cast_fp16 = mul(x = x_219_cast_fp16, y = const_131_promoted_to_fp16)[name = string("op_6920_cast_fp16")];
+            bool input_331_interleave_0 = const()[name = string("input_331_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_331_cast_fp16 = concat(axis = var_6918, interleave = input_331_interleave_0, values = (x_219_cast_fp16, var_6920_cast_fp16))[name = string("input_331_cast_fp16")];
+            tensor<int32, [1]> normed_313_axes_0 = const()[name = string("normed_313_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6915_to_fp16 = const()[name = string("op_6915_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_313_cast_fp16 = layer_norm(axes = normed_313_axes_0, epsilon = var_6915_to_fp16, x = input_331_cast_fp16)[name = string("normed_313_cast_fp16")];
+            tensor<int32, [2]> var_6925_split_sizes_0 = const()[name = string("op_6925_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6925_axis_0 = const()[name = string("op_6925_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6925_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6925_cast_fp16_1 = split(axis = var_6925_axis_0, split_sizes = var_6925_split_sizes_0, x = normed_313_cast_fp16)[name = string("op_6925_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580373760)))];
+            tensor<fp16, [1, 1, 2560]> h_67_cast_fp16 = mul(x = var_6925_cast_fp16_0, y = layers_11_input_layernorm_weight_promoted_to_fp16)[name = string("h_67_cast_fp16")];
+            tensor<int32, [3]> var_6931 = const()[name = string("op_6931"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6934_axes_0 = const()[name = string("op_6934_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6932_cast_fp16 = transpose(perm = var_6931, x = h_67_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 2560, 1, 1]> var_6934_cast_fp16 = expand_dims(axes = var_6934_axes_0, x = var_6932_cast_fp16)[name = string("op_6934_cast_fp16")];
+            string var_6950_pad_type_0 = const()[name = string("op_6950_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6950_strides_0 = const()[name = string("op_6950_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6950_pad_0 = const()[name = string("op_6950_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6950_dilations_0 = const()[name = string("op_6950_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6950_groups_0 = const()[name = string("op_6950_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_6950 = conv(dilations = var_6950_dilations_0, groups = var_6950_groups_0, pad = var_6950_pad_0, pad_type = var_6950_pad_type_0, strides = var_6950_strides_0, weight = layers_11_self_attn_q_proj_weight_palettized, x = var_6934_cast_fp16)[name = string("op_6950")];
+            tensor<int32, [4]> var_6955 = const()[name = string("op_6955"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_6956 = reshape(shape = var_6955, x = var_6950)[name = string("op_6956")];
+            tensor<int32, [4]> var_6961 = const()[name = string("op_6961"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6971 = const()[name = string("op_6971"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_6962 = transpose(perm = var_6961, x = var_6956)[name = string("transpose_16")];
+            tensor<fp16, [1, 8, 512]> x_221 = reshape(shape = var_6971, x = var_6962)[name = string("x_221")];
+            int32 var_6977 = const()[name = string("op_6977"), val = int32(-1)];
+            fp16 const_132_promoted = const()[name = string("const_132_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_6979 = mul(x = x_221, y = const_132_promoted)[name = string("op_6979")];
+            bool input_335_interleave_0 = const()[name = string("input_335_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_335 = concat(axis = var_6977, interleave = input_335_interleave_0, values = (x_221, var_6979))[name = string("input_335")];
+            tensor<int32, [1]> normed_317_axes_0 = const()[name = string("normed_317_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6974_to_fp16 = const()[name = string("op_6974_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_317_cast_fp16 = layer_norm(axes = normed_317_axes_0, epsilon = var_6974_to_fp16, x = input_335)[name = string("normed_317_cast_fp16")];
+            tensor<int32, [2]> var_6984_split_sizes_0 = const()[name = string("op_6984_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_6984_axis_0 = const()[name = string("op_6984_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_6984_0, tensor<fp16, [1, 8, 512]> var_6984_1 = split(axis = var_6984_axis_0, split_sizes = var_6984_split_sizes_0, x = normed_317_cast_fp16)[name = string("op_6984")];
+            tensor<fp16, [1, 8, 512]> var_6986 = mul(x = var_6984_0, y = layers_11_self_attn_q_norm_weight)[name = string("op_6986")];
+            tensor<int32, [4]> var_6991 = const()[name = string("op_6991"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_91 = reshape(shape = var_6991, x = var_6986)[name = string("q_91")];
+            tensor<fp16, [1, 8, 1, 512]> var_6993_cast_fp16 = mul(x = q_91, y = cos_f)[name = string("op_6993_cast_fp16")];
+            tensor<int32, [2]> var_6994_split_sizes_0 = const()[name = string("op_6994_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6994_axis_0 = const()[name = string("op_6994_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_6994_0, tensor<fp16, [1, 8, 1, 256]> var_6994_1 = split(axis = var_6994_axis_0, split_sizes = var_6994_split_sizes_0, x = q_91)[name = string("op_6994")];
+            fp16 const_133_promoted = const()[name = string("const_133_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_6996 = mul(x = var_6994_1, y = const_133_promoted)[name = string("op_6996")];
+            int32 var_6998 = const()[name = string("op_6998"), val = int32(-1)];
+            bool var_6999_interleave_0 = const()[name = string("op_6999_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_6999 = concat(axis = var_6998, interleave = var_6999_interleave_0, values = (var_6996, var_6994_0))[name = string("op_6999")];
+            tensor<fp16, [1, 8, 1, 512]> var_7000_cast_fp16 = mul(x = var_6999, y = sin_f)[name = string("op_7000_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_cast_fp16 = add(x = var_6993_cast_fp16, y = var_7000_cast_fp16)[name = string("q_cast_fp16")];
+            string var_7013_pad_type_0 = const()[name = string("op_7013_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7013_strides_0 = const()[name = string("op_7013_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7013_pad_0 = const()[name = string("op_7013_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7013_dilations_0 = const()[name = string("op_7013_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7013_groups_0 = const()[name = string("op_7013_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_7013 = conv(dilations = var_7013_dilations_0, groups = var_7013_groups_0, pad = var_7013_pad_0, pad_type = var_7013_pad_type_0, strides = var_7013_strides_0, weight = layers_11_self_attn_k_proj_weight_palettized, x = var_6934_cast_fp16)[name = string("op_7013")];
+            tensor<int32, [4]> var_7018 = const()[name = string("op_7018"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_7019 = reshape(shape = var_7018, x = var_7013)[name = string("op_7019")];
+            tensor<int32, [4]> var_7024 = const()[name = string("op_7024"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_7041_pad_type_0 = const()[name = string("op_7041_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7041_strides_0 = const()[name = string("op_7041_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7041_pad_0 = const()[name = string("op_7041_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7041_dilations_0 = const()[name = string("op_7041_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7041_groups_0 = const()[name = string("op_7041_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_7041 = conv(dilations = var_7041_dilations_0, groups = var_7041_groups_0, pad = var_7041_pad_0, pad_type = var_7041_pad_type_0, strides = var_7041_strides_0, weight = layers_11_self_attn_v_proj_weight_palettized, x = var_6934_cast_fp16)[name = string("op_7041")];
+            tensor<int32, [4]> var_7046 = const()[name = string("op_7046"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_7047 = reshape(shape = var_7046, x = var_7041)[name = string("op_7047")];
+            tensor<int32, [4]> var_7052 = const()[name = string("op_7052"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7062 = const()[name = string("op_7062"), val = tensor<int32, [3]>([1, 2, 512])];
+            tensor<fp16, [1, 2, 1, 512]> var_7025 = transpose(perm = var_7024, x = var_7019)[name = string("transpose_15")];
+            tensor<fp16, [1, 2, 512]> x_223 = reshape(shape = var_7062, x = var_7025)[name = string("x_223")];
+            int32 var_7068 = const()[name = string("op_7068"), val = int32(-1)];
+            fp16 const_134_promoted = const()[name = string("const_134_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 512]> var_7070 = mul(x = x_223, y = const_134_promoted)[name = string("op_7070")];
+            bool input_337_interleave_0 = const()[name = string("input_337_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1024]> input_337 = concat(axis = var_7068, interleave = input_337_interleave_0, values = (x_223, var_7070))[name = string("input_337")];
+            tensor<int32, [1]> normed_321_axes_0 = const()[name = string("normed_321_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7065_to_fp16 = const()[name = string("op_7065_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1024]> normed_321_cast_fp16 = layer_norm(axes = normed_321_axes_0, epsilon = var_7065_to_fp16, x = input_337)[name = string("normed_321_cast_fp16")];
+            tensor<int32, [2]> var_7075_split_sizes_0 = const()[name = string("op_7075_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7075_axis_0 = const()[name = string("op_7075_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 512]> var_7075_0, tensor<fp16, [1, 2, 512]> var_7075_1 = split(axis = var_7075_axis_0, split_sizes = var_7075_split_sizes_0, x = normed_321_cast_fp16)[name = string("op_7075")];
+            tensor<fp16, [1, 2, 512]> var_7077 = mul(x = var_7075_0, y = layers_11_self_attn_k_norm_weight)[name = string("op_7077")];
+            tensor<int32, [4]> var_7082 = const()[name = string("op_7082"), val = tensor<int32, [4]>([1, 2, 1, 512])];
+            tensor<fp16, [1, 2, 1, 512]> q_93 = reshape(shape = var_7082, x = var_7077)[name = string("q_93")];
+            fp16 var_7084_promoted = const()[name = string("op_7084_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 512]> var_7053 = transpose(perm = var_7052, x = var_7047)[name = string("transpose_14")];
+            tensor<fp16, [1, 2, 1, 512]> var_7085 = pow(x = var_7053, y = var_7084_promoted)[name = string("op_7085")];
+            tensor<int32, [1]> var_7090_axes_0 = const()[name = string("op_7090_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7090_keep_dims_0 = const()[name = string("op_7090_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_7090 = reduce_mean(axes = var_7090_axes_0, keep_dims = var_7090_keep_dims_0, x = var_7085)[name = string("op_7090")];
+            fp16 var_7092_to_fp16 = const()[name = string("op_7092_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_cast_fp16 = add(x = var_7090, y = var_7092_to_fp16)[name = string("mean_sq_cast_fp16")];
+            fp32 var_7094_epsilon_0 = const()[name = string("op_7094_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_7094_cast_fp16 = rsqrt(epsilon = var_7094_epsilon_0, x = mean_sq_cast_fp16)[name = string("op_7094_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> v_cast_fp16 = mul(x = var_7053, y = var_7094_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> var_7096_cast_fp16 = mul(x = q_93, y = cos_f)[name = string("op_7096_cast_fp16")];
+            tensor<int32, [2]> var_7097_split_sizes_0 = const()[name = string("op_7097_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7097_axis_0 = const()[name = string("op_7097_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 256]> var_7097_0, tensor<fp16, [1, 2, 1, 256]> var_7097_1 = split(axis = var_7097_axis_0, split_sizes = var_7097_split_sizes_0, x = q_93)[name = string("op_7097")];
+            fp16 const_135_promoted = const()[name = string("const_135_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 256]> var_7099 = mul(x = var_7097_1, y = const_135_promoted)[name = string("op_7099")];
+            int32 var_7101 = const()[name = string("op_7101"), val = int32(-1)];
+            bool var_7102_interleave_0 = const()[name = string("op_7102_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 512]> var_7102 = concat(axis = var_7101, interleave = var_7102_interleave_0, values = (var_7099, var_7097_0))[name = string("op_7102")];
+            tensor<fp16, [1, 2, 1, 512]> var_7103_cast_fp16 = mul(x = var_7102, y = sin_f)[name = string("op_7103_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> k_cast_fp16 = add(x = var_7096_cast_fp16, y = var_7103_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7109_cast_fp16 = mul(x = K_full_slot_cast_fp16, y = var_3796_cast_fp16)[name = string("op_7109_cast_fp16")];
+            tensor<int32, [4]> var_7110_reps_0 = const()[name = string("op_7110_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7110_cast_fp16 = tile(reps = var_7110_reps_0, x = k_cast_fp16)[name = string("op_7110_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7111_cast_fp16 = mul(x = var_7110_cast_fp16, y = update_mask)[name = string("op_7111_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_out_cast_fp16 = add(x = var_7109_cast_fp16, y = var_7111_cast_fp16)[name = string("K_full_out_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7117_cast_fp16 = mul(x = V_full_slot_cast_fp16, y = var_3796_cast_fp16)[name = string("op_7117_cast_fp16")];
+            tensor<int32, [4]> var_7118_reps_0 = const()[name = string("op_7118_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7118_cast_fp16 = tile(reps = var_7118_reps_0, x = v_cast_fp16)[name = string("op_7118_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7119_cast_fp16 = mul(x = var_7118_cast_fp16, y = update_mask)[name = string("op_7119_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_out_cast_fp16 = add(x = var_7117_cast_fp16, y = var_7119_cast_fp16)[name = string("V_full_out_cast_fp16")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_22_reps_0 = const()[name = string("tile_22_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_44_cast_fp16 = transpose(perm = transpose_44_perm_0, x = K_full_out_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_22_cast_fp16 = tile(reps = tile_22_reps_0, x = transpose_44_cast_fp16)[name = string("tile_22_cast_fp16")];
+            tensor<int32, [5]> concat_44 = const()[name = string("concat_44"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_44_cast_fp16 = reshape(shape = concat_44, x = tile_22_cast_fp16)[name = string("reshape_44_cast_fp16")];
+            tensor<int32, [5]> transpose_45_perm_0 = const()[name = string("transpose_45_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_45 = const()[name = string("concat_45"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_45_cast_fp16 = transpose(perm = transpose_45_perm_0, x = reshape_44_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_45_cast_fp16 = reshape(shape = concat_45, x = transpose_45_cast_fp16)[name = string("reshape_45_cast_fp16")];
+            tensor<int32, [4]> transpose_59_perm_0 = const()[name = string("transpose_59_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_23_reps_0 = const()[name = string("tile_23_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_46_cast_fp16 = transpose(perm = transpose_46_perm_0, x = V_full_out_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_23_cast_fp16 = tile(reps = tile_23_reps_0, x = transpose_46_cast_fp16)[name = string("tile_23_cast_fp16")];
+            tensor<int32, [5]> concat_46 = const()[name = string("concat_46"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_46_cast_fp16 = reshape(shape = concat_46, x = tile_23_cast_fp16)[name = string("reshape_46_cast_fp16")];
+            tensor<int32, [5]> transpose_47_perm_0 = const()[name = string("transpose_47_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_47 = const()[name = string("concat_47"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_47_cast_fp16 = transpose(perm = transpose_47_perm_0, x = reshape_46_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_47_cast_fp16 = reshape(shape = concat_47, x = transpose_47_cast_fp16)[name = string("reshape_47_cast_fp16")];
+            tensor<int32, [4]> V_expanded_perm_0 = const()[name = string("V_expanded_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_45_transpose_x_0 = const()[name = string("attn_weights_45_transpose_x_0"), val = bool(false)];
+            bool attn_weights_45_transpose_y_0 = const()[name = string("attn_weights_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_59_cast_fp16 = transpose(perm = transpose_59_perm_0, x = reshape_45_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_45_cast_fp16 = matmul(transpose_x = attn_weights_45_transpose_x_0, transpose_y = attn_weights_45_transpose_y_0, x = q_cast_fp16, y = transpose_59_cast_fp16)[name = string("attn_weights_45_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_227_cast_fp16 = add(x = attn_weights_45_cast_fp16, y = causal_mask_full)[name = string("x_227_cast_fp16")];
+            tensor<int32, [1]> reduce_max_11_axes_0 = const()[name = string("reduce_max_11_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_11_keep_dims_0 = const()[name = string("reduce_max_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_11 = reduce_max(axes = reduce_max_11_axes_0, keep_dims = reduce_max_11_keep_dims_0, x = x_227_cast_fp16)[name = string("reduce_max_11")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7161 = sub(x = x_227_cast_fp16, y = reduce_max_11)[name = string("op_7161")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7167 = exp(x = var_7161)[name = string("op_7167")];
+            tensor<int32, [1]> var_7177_axes_0 = const()[name = string("op_7177_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7177_keep_dims_0 = const()[name = string("op_7177_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_7177 = reduce_sum(axes = var_7177_axes_0, keep_dims = var_7177_keep_dims_0, x = var_7167)[name = string("op_7177")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7183_cast_fp16 = real_div(x = var_7167, y = var_7177)[name = string("op_7183_cast_fp16")];
+            bool attn_output_67_transpose_x_0 = const()[name = string("attn_output_67_transpose_x_0"), val = bool(false)];
+            bool attn_output_67_transpose_y_0 = const()[name = string("attn_output_67_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_cast_fp16 = transpose(perm = V_expanded_perm_0, x = reshape_47_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_67_cast_fp16 = matmul(transpose_x = attn_output_67_transpose_x_0, transpose_y = attn_output_67_transpose_y_0, x = var_7183_cast_fp16, y = V_expanded_cast_fp16)[name = string("attn_output_67_cast_fp16")];
+            tensor<int32, [4]> var_7194 = const()[name = string("op_7194"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7201 = const()[name = string("op_7201"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_7195_cast_fp16 = transpose(perm = var_7194, x = attn_output_67_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1, 4096]> attn_output_69_cast_fp16 = reshape(shape = var_7201, x = var_7195_cast_fp16)[name = string("attn_output_69_cast_fp16")];
+            tensor<int32, [3]> var_7206 = const()[name = string("op_7206"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7222_pad_type_0 = const()[name = string("op_7222_pad_type_0"), val = string("valid")];
+            int32 var_7222_groups_0 = const()[name = string("op_7222_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7222_strides_0 = const()[name = string("op_7222_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7222_pad_0 = const()[name = string("op_7222_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7222_dilations_0 = const()[name = string("op_7222_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_11_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580378944))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585621888))))[name = string("squeeze_11_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_7207_cast_fp16 = transpose(perm = var_7206, x = attn_output_69_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1]> var_7222_cast_fp16 = conv(dilations = var_7222_dilations_0, groups = var_7222_groups_0, pad = var_7222_pad_0, pad_type = var_7222_pad_type_0, strides = var_7222_strides_0, weight = squeeze_11_cast_fp16_to_fp32_to_fp16_palettized, x = var_7207_cast_fp16)[name = string("op_7222_cast_fp16")];
+            tensor<int32, [3]> var_7226 = const()[name = string("op_7226"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7232 = const()[name = string("op_7232"), val = int32(-1)];
+            fp16 const_136_promoted_to_fp16 = const()[name = string("const_136_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_231_cast_fp16 = transpose(perm = var_7226, x = var_7222_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 1, 2560]> var_7234_cast_fp16 = mul(x = x_231_cast_fp16, y = const_136_promoted_to_fp16)[name = string("op_7234_cast_fp16")];
+            bool input_341_interleave_0 = const()[name = string("input_341_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_341_cast_fp16 = concat(axis = var_7232, interleave = input_341_interleave_0, values = (x_231_cast_fp16, var_7234_cast_fp16))[name = string("input_341_cast_fp16")];
+            tensor<int32, [1]> normed_325_axes_0 = const()[name = string("normed_325_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7229_to_fp16 = const()[name = string("op_7229_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_325_cast_fp16 = layer_norm(axes = normed_325_axes_0, epsilon = var_7229_to_fp16, x = input_341_cast_fp16)[name = string("normed_325_cast_fp16")];
+            tensor<int32, [2]> var_7239_split_sizes_0 = const()[name = string("op_7239_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7239_axis_0 = const()[name = string("op_7239_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7239_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7239_cast_fp16_1 = split(axis = var_7239_axis_0, split_sizes = var_7239_split_sizes_0, x = normed_325_cast_fp16)[name = string("op_7239_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585624512)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_cast_fp16 = mul(x = var_7239_cast_fp16_0, y = layers_11_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_233_cast_fp16 = add(x = x_219_cast_fp16, y = attn_output_cast_fp16)[name = string("x_233_cast_fp16")];
+            int32 var_7248 = const()[name = string("op_7248"), val = int32(-1)];
+            fp16 const_137_promoted_to_fp16 = const()[name = string("const_137_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_7250_cast_fp16 = mul(x = x_233_cast_fp16, y = const_137_promoted_to_fp16)[name = string("op_7250_cast_fp16")];
+            bool input_343_interleave_0 = const()[name = string("input_343_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_343_cast_fp16 = concat(axis = var_7248, interleave = input_343_interleave_0, values = (x_233_cast_fp16, var_7250_cast_fp16))[name = string("input_343_cast_fp16")];
+            tensor<int32, [1]> normed_329_axes_0 = const()[name = string("normed_329_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7245_to_fp16 = const()[name = string("op_7245_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_329_cast_fp16 = layer_norm(axes = normed_329_axes_0, epsilon = var_7245_to_fp16, x = input_343_cast_fp16)[name = string("normed_329_cast_fp16")];
+            tensor<int32, [2]> var_7255_split_sizes_0 = const()[name = string("op_7255_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7255_axis_0 = const()[name = string("op_7255_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7255_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7255_cast_fp16_1 = split(axis = var_7255_axis_0, split_sizes = var_7255_split_sizes_0, x = normed_329_cast_fp16)[name = string("op_7255_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585629696)))];
+            tensor<fp16, [1, 1, 2560]> h_69_cast_fp16 = mul(x = var_7255_cast_fp16_0, y = layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_69_cast_fp16")];
+            tensor<int32, [3]> var_7266 = const()[name = string("op_7266"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_345_axes_0 = const()[name = string("input_345_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7267 = transpose(perm = var_7266, x = h_69_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 1]> input_345 = expand_dims(axes = input_345_axes_0, x = var_7267)[name = string("input_345")];
+            string gate_45_pad_type_0 = const()[name = string("gate_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_45_strides_0 = const()[name = string("gate_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_45_pad_0 = const()[name = string("gate_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_45_dilations_0 = const()[name = string("gate_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_45_groups_0 = const()[name = string("gate_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_45 = conv(dilations = gate_45_dilations_0, groups = gate_45_groups_0, pad = gate_45_pad_0, pad_type = gate_45_pad_type_0, strides = gate_45_strides_0, weight = layers_11_mlp_gate_proj_weight_palettized, x = input_345)[name = string("gate_45")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_11_mlp_up_proj_weight_palettized, x = input_345)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate = gelu(mode = gate_mode_0, x = gate_45)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 1]> input_347 = mul(x = gate, y = up)[name = string("input_347")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_11_mlp_down_proj_weight_palettized, x = input_347)[name = string("mlp_out")];
+            tensor<int32, [1]> var_7307_axes_0 = const()[name = string("op_7307_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7307 = squeeze(axes = var_7307_axes_0, x = mlp_out)[name = string("op_7307")];
+            tensor<int32, [3]> var_7311 = const()[name = string("op_7311"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7317 = const()[name = string("op_7317"), val = int32(-1)];
+            fp16 const_138_promoted = const()[name = string("const_138_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_235 = transpose(perm = var_7311, x = var_7307)[name = string("transpose_3")];
+            tensor<fp16, [1, 1, 2560]> var_7319 = mul(x = x_235, y = const_138_promoted)[name = string("op_7319")];
+            bool input_349_interleave_0 = const()[name = string("input_349_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_349 = concat(axis = var_7317, interleave = input_349_interleave_0, values = (x_235, var_7319))[name = string("input_349")];
+            tensor<int32, [1]> normed_333_axes_0 = const()[name = string("normed_333_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7314_to_fp16 = const()[name = string("op_7314_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_333_cast_fp16 = layer_norm(axes = normed_333_axes_0, epsilon = var_7314_to_fp16, x = input_349)[name = string("normed_333_cast_fp16")];
+            tensor<int32, [2]> var_7324_split_sizes_0 = const()[name = string("op_7324_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7324_axis_0 = const()[name = string("op_7324_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7324_0, tensor<fp16, [1, 1, 2560]> var_7324_1 = split(axis = var_7324_axis_0, split_sizes = var_7324_split_sizes_0, x = normed_333_cast_fp16)[name = string("op_7324")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_113 = mul(x = var_7324_0, y = layers_11_post_feedforward_layernorm_weight)[name = string("hidden_states_113")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_115_cast_fp16 = add(x = x_233_cast_fp16, y = hidden_states_113)[name = string("hidden_states_115_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 2816])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 1, 3072])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_7352 = const()[name = string("op_7352"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_351_axes_0 = const()[name = string("input_351_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7353 = transpose(perm = var_7352, x = hidden_states_115_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 1]> input_351 = expand_dims(axes = input_351_axes_0, x = var_7353)[name = string("input_351")];
+            string gated_67_pad_type_0 = const()[name = string("gated_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_67_strides_0 = const()[name = string("gated_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_67_pad_0 = const()[name = string("gated_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_67_dilations_0 = const()[name = string("gated_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_67_groups_0 = const()[name = string("gated_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_67 = conv(dilations = gated_67_dilations_0, groups = gated_67_groups_0, pad = gated_67_pad_0, pad_type = gated_67_pad_type_0, strides = gated_67_strides_0, weight = layers_11_per_layer_input_gate_weight_palettized, x = input_351)[name = string("gated_67")];
+            string gated_69_mode_0 = const()[name = string("gated_69_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_69 = gelu(mode = gated_69_mode_0, x = gated_67)[name = string("gated_69")];
+            tensor<int32, [3]> var_7372 = const()[name = string("op_7372"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_7373_cast_fp16 = transpose(perm = var_7372, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_7373_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_353_cast_fp16 = mul(x = gated_69, y = per_layer_slice_conv_cast_fp16)[name = string("input_353_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_11_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585634880))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585962624))))[name = string("layers_11_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_11_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_353_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_7389_axes_0 = const()[name = string("op_7389_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7389_cast_fp16 = squeeze(axes = var_7389_axes_0, x = gated_cast_fp16)[name = string("op_7389_cast_fp16")];
+            tensor<int32, [3]> var_7393 = const()[name = string("op_7393"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7399 = const()[name = string("op_7399"), val = int32(-1)];
+            fp16 const_139_promoted_to_fp16 = const()[name = string("const_139_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_cast_fp16 = transpose(perm = var_7393, x = var_7389_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 1, 2560]> var_7401_cast_fp16 = mul(x = x_cast_fp16, y = const_139_promoted_to_fp16)[name = string("op_7401_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_cast_fp16 = concat(axis = var_7399, interleave = input_interleave_0, values = (x_cast_fp16, var_7401_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_337_axes_0 = const()[name = string("normed_337_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7396_to_fp16 = const()[name = string("op_7396_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_337_cast_fp16 = layer_norm(axes = normed_337_axes_0, epsilon = var_7396_to_fp16, x = input_cast_fp16)[name = string("normed_337_cast_fp16")];
+            tensor<int32, [2]> var_7406_split_sizes_0 = const()[name = string("op_7406_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7406_axis_0 = const()[name = string("op_7406_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7406_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7406_cast_fp16_1 = split(axis = var_7406_axis_0, split_sizes = var_7406_split_sizes_0, x = normed_337_cast_fp16)[name = string("op_7406_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585965248)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_119_cast_fp16 = mul(x = var_7406_cast_fp16_0, y = layers_11_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_115_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_140_promoted_to_fp16 = const()[name = string("const_140_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.0ap-1])];
+            tensor<fp16, [1, 1, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_140_promoted_to_fp16)[name = string("op_7416_cast_fp16")];
+            tensor<int32, [1]> var_7418_axes_0 = const()[name = string("op_7418_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_7418_cast_fp16 = squeeze(axes = var_7418_axes_0, x = K_full_out_cast_fp16)[name = string("op_7418_cast_fp16")];
+            tensor<int32, [1]> var_7420_axes_0 = const()[name = string("op_7420_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_7420_cast_fp16 = squeeze(axes = var_7420_axes_0, x = V_full_out_cast_fp16)[name = string("op_7420_cast_fp16")];
+            int32 var_7423_axis_0 = const()[name = string("op_7423_axis_0"), val = int32(0)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out = stack(axis = var_7423_axis_0, values = (var_1353_cast_fp16, var_1912_cast_fp16, var_2471_cast_fp16, var_3030_cast_fp16, var_3589_cast_fp16, var_4665_cast_fp16, var_5224_cast_fp16, var_5783_cast_fp16, var_6342_cast_fp16, var_6901_cast_fp16))[name = string("op_7423_cast_fp16")];
+            int32 var_7426_axis_0 = const()[name = string("op_7426_axis_0"), val = int32(0)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out = stack(axis = var_7426_axis_0, values = (var_1355_cast_fp16, var_1914_cast_fp16, var_2473_cast_fp16, var_3032_cast_fp16, var_3591_cast_fp16, var_4667_cast_fp16, var_5226_cast_fp16, var_5785_cast_fp16, var_6344_cast_fp16, var_6903_cast_fp16))[name = string("op_7426_cast_fp16")];
+            int32 var_7429_axis_0 = const()[name = string("op_7429_axis_0"), val = int32(0)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out = stack(axis = var_7429_axis_0, values = (var_4106_cast_fp16, var_7418_cast_fp16))[name = string("op_7429_cast_fp16")];
+            int32 var_7432_axis_0 = const()[name = string("op_7432_axis_0"), val = int32(0)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out = stack(axis = var_7432_axis_0, values = (var_4108_cast_fp16, var_7420_cast_fp16))[name = string("op_7432_cast_fp16")];
+        } -> (hidden_states_out, K_sliding_out, V_sliding_out, K_full_out, V_full_out, per_layer_combined_out);
+    func verify_qK<ios18>(tensor<fp16, [2, 2, 2048, 512]> K_full_in, tensor<fp16, [10, 2, 512, 512]> K_sliding_in, tensor<fp16, [2, 2, 2048, 512]> V_full_in, tensor<fp16, [10, 2, 512, 512]> V_sliding_in, tensor<fp16, [1, 1, 3, 2048]> causal_mask_full, tensor<fp16, [1, 1, 3, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 3, 512]> cos_f, tensor<fp16, [1, 1, 3, 256]> cos_s, tensor<fp16, [1, 3, 2560]> hidden_states, tensor<fp16, [1, 3, 10752]> per_layer_raw, tensor<fp16, [1, 1, 3, 512]> sin_f, tensor<fp16, [1, 1, 3, 256]> sin_s, tensor<fp16, [1, 1, 2048, 3]> update_indicator) {
+            tensor<fp16, [10752, 2560, 1, 1]> per_layer_model_projection_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10752, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [336, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13762688))))[name = string("per_layer_model_projection_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(13773504))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16395008))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16397120)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(16397696))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17053120))))[name = string("layers_0_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17053696))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17709120))))[name = string("layers_0_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_k_norm_weight = const()[name = string("layers_0_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17709696)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17710272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30817536))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30827840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43935104))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43945408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57052672))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57055296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57060480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57388224))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57388544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60010048))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_q_norm_weight = const()[name = string("layers_1_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60012160)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60012736))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60668160))))[name = string("layers_1_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60668736))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61324160))))[name = string("layers_1_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_k_norm_weight = const()[name = string("layers_1_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61324736)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(61325312))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74432576))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(74442880))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87550144))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87560448))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100667712))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100670336)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100675520))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101003264))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(101003584))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103625088))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103627200)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(103627776))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104283200))))[name = string("layers_2_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104283776))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104939200))))[name = string("layers_2_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_k_norm_weight = const()[name = string("layers_2_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104939776)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104940352))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(118047616))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(118057920))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131165184))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131175488))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144282752))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144285376)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144290560))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144618304))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144618624))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147240128))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_q_norm_weight = const()[name = string("layers_3_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147242240)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147242816))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147898240))))[name = string("layers_3_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147898816))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148554240))))[name = string("layers_3_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_k_norm_weight = const()[name = string("layers_3_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148554816)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(148555392))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161662656))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161672960))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174780224))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174790528))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187897792))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187900416)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187905600))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188233344))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188233664))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190855168))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_q_norm_weight = const()[name = string("layers_4_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190857280)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190857856))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191513280))))[name = string("layers_4_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191513856))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192169280))))[name = string("layers_4_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_k_norm_weight = const()[name = string("layers_4_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192169856)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(192170432))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(205277696))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(205288000))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218395264))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218405568))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231512832))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231515456)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231520640))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231848384))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(231848704))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(237091648))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_q_norm_weight = const()[name = string("layers_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(237095808)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(237096896))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(238407680))))[name = string("layers_5_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(238408768))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239719552))))[name = string("layers_5_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_k_norm_weight = const()[name = string("layers_5_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239720640)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239721728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252828992))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252839296))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265946560))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265956864))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279064128))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279066752)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279071936))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279399680))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(279400000))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282021504))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282023616))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282679040))))[name = string("layers_6_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282679616))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283335040))))[name = string("layers_6_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_6_self_attn_k_norm_weight = const()[name = string("layers_6_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283335616)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(283336192))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(296443456))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(296453760))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309561024))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309571328))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322678592))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322681216)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322686400))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(323014144))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(323014464))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(325635968))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_7_self_attn_q_norm_weight = const()[name = string("layers_7_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(325638080)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(325638656))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326294080))))[name = string("layers_7_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326294656))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326950080))))[name = string("layers_7_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326950656))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340057920))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340068224))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(353175488))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(353185792))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366293056))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366295680)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366300864))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366628608))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(366628928))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369250432))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_8_self_attn_q_norm_weight = const()[name = string("layers_8_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369252544)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369253120))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369908544))))[name = string("layers_8_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369909120))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(370564544))))[name = string("layers_8_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_8_self_attn_k_norm_weight = const()[name = string("layers_8_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(370565120)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(370565696))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383672960))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383683264))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396790528))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396800832))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409908096))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409910720)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409915904))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(410243648))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_9_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(410243968))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412865472))))[name = string("layers_9_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_q_norm_weight = const()[name = string("layers_9_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412867584)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412868160))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413523584))))[name = string("layers_9_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413524160))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(414179584))))[name = string("layers_9_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_k_norm_weight = const()[name = string("layers_9_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(414180160)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(414180736))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(427288000))))[name = string("layers_9_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(427298304))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440405568))))[name = string("layers_9_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_9_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440415872))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453523136))))[name = string("layers_9_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_9_post_feedforward_layernorm_weight = const()[name = string("layers_9_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453525760)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_9_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453530944))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453858688))))[name = string("layers_9_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_10_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(453859008))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(456480512))))[name = string("layers_10_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(456482624))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457138048))))[name = string("layers_10_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457138624))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457794048))))[name = string("layers_10_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457794624))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470901888))))[name = string("layers_10_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470912192))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(484019456))))[name = string("layers_10_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_10_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(484029760))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497137024))))[name = string("layers_10_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_10_post_feedforward_layernorm_weight = const()[name = string("layers_10_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497139648)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_10_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497144832))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497472576))))[name = string("layers_10_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_11_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(497472896))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(502715840))))[name = string("layers_11_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_q_norm_weight = const()[name = string("layers_11_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(502720000)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(502721088))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504031872))))[name = string("layers_11_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504032960))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(505343744))))[name = string("layers_11_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_k_norm_weight = const()[name = string("layers_11_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(505344832)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(505345920))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(518453184))))[name = string("layers_11_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(518463488))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531570752))))[name = string("layers_11_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_11_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531581056))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(544688320))))[name = string("layers_11_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_11_post_feedforward_layernorm_weight = const()[name = string("layers_11_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(544690944)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_11_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(544696128))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545023872))))[name = string("layers_11_per_layer_input_gate_weight_palettized")];
+            tensor<int32, [3]> var_740 = const()[name = string("op_740"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_743_axes_0 = const()[name = string("op_743_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_741_cast_fp16 = transpose(perm = var_740, x = hidden_states)[name = string("transpose_241")];
+            tensor<fp16, [1, 2560, 1, 3]> var_743_cast_fp16 = expand_dims(axes = var_743_axes_0, x = var_741_cast_fp16)[name = string("op_743_cast_fp16")];
+            string var_759_pad_type_0 = const()[name = string("op_759_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_759_strides_0 = const()[name = string("op_759_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_759_pad_0 = const()[name = string("op_759_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_759_dilations_0 = const()[name = string("op_759_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_759_groups_0 = const()[name = string("op_759_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10752, 1, 3]> var_759 = conv(dilations = var_759_dilations_0, groups = var_759_groups_0, pad = var_759_pad_0, pad_type = var_759_pad_type_0, strides = var_759_strides_0, weight = per_layer_model_projection_weight_palettized, x = var_743_cast_fp16)[name = string("op_759")];
+            fp16 var_760_to_fp16 = const()[name = string("op_760_to_fp16"), val = fp16(0x1.43cp-6)];
+            tensor<fp16, [1, 10752, 1, 3]> proj_1_cast_fp16 = mul(x = var_759, y = var_760_to_fp16)[name = string("proj_1_cast_fp16")];
+            tensor<int32, [1]> var_763_axes_0 = const()[name = string("op_763_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 10752, 3]> var_763_cast_fp16 = squeeze(axes = var_763_axes_0, x = proj_1_cast_fp16)[name = string("op_763_cast_fp16")];
+            tensor<int32, [3]> var_767 = const()[name = string("op_767"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [3]> var_774 = const()[name = string("op_774"), val = tensor<int32, [3]>([3, 42, 256])];
+            tensor<fp16, [1, 3, 10752]> proj_3_cast_fp16 = transpose(perm = var_767, x = var_763_cast_fp16)[name = string("transpose_240")];
+            tensor<fp16, [3, 42, 256]> proj_grouped_cast_fp16 = reshape(shape = var_774, x = proj_3_cast_fp16)[name = string("proj_grouped_cast_fp16")];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 42, 256]> var_776_cast_fp16 = mul(x = proj_grouped_cast_fp16, y = const_0_promoted_to_fp16)[name = string("op_776_cast_fp16")];
+            int32 var_778 = const()[name = string("op_778"), val = int32(-1)];
+            bool input_3_interleave_0 = const()[name = string("input_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 42, 512]> input_3_cast_fp16 = concat(axis = var_778, interleave = input_3_interleave_0, values = (proj_grouped_cast_fp16, var_776_cast_fp16))[name = string("input_3_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_784_to_fp16 = const()[name = string("op_784_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 42, 512]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_784_to_fp16, x = input_3_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_787_split_sizes_0 = const()[name = string("op_787_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_787_axis_0 = const()[name = string("op_787_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 42, 256]> var_787_cast_fp16_0, tensor<fp16, [3, 42, 256]> var_787_cast_fp16_1 = split(axis = var_787_axis_0, split_sizes = var_787_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_787_cast_fp16")];
+            tensor<fp16, [256]> per_layer_projection_norm_weight_promoted_to_fp16 = const()[name = string("per_layer_projection_norm_weight_promoted_to_fp16"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545024192)))];
+            tensor<fp16, [3, 42, 256]> var_789_cast_fp16 = mul(x = var_787_cast_fp16_0, y = per_layer_projection_norm_weight_promoted_to_fp16)[name = string("op_789_cast_fp16")];
+            tensor<int32, [3]> var_793 = const()[name = string("op_793"), val = tensor<int32, [3]>([1, 3, 10752])];
+            tensor<fp16, [1, 3, 10752]> proj_normed_cast_fp16 = reshape(shape = var_793, x = var_789_cast_fp16)[name = string("proj_normed_cast_fp16")];
+            tensor<fp16, [1, 3, 10752]> var_796_cast_fp16 = add(x = proj_normed_cast_fp16, y = per_layer_raw)[name = string("op_796_cast_fp16")];
+            fp16 var_797_to_fp16 = const()[name = string("op_797_to_fp16"), val = fp16(0x1.6ap-1)];
+            tensor<fp16, [1, 3, 10752]> per_layer_combined_out = mul(x = var_796_cast_fp16, y = var_797_to_fp16)[name = string("per_layer_combined_cast_fp16")];
+            int32 var_803 = const()[name = string("op_803"), val = int32(-1)];
+            fp16 const_1_promoted_to_fp16 = const()[name = string("const_1_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_805_cast_fp16 = mul(x = hidden_states, y = const_1_promoted_to_fp16)[name = string("op_805_cast_fp16")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_5_cast_fp16 = concat(axis = var_803, interleave = input_5_interleave_0, values = (hidden_states, var_805_cast_fp16))[name = string("input_5_cast_fp16")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_800_to_fp16 = const()[name = string("op_800_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_800_to_fp16, x = input_5_cast_fp16)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_810_split_sizes_0 = const()[name = string("op_810_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_810_axis_0 = const()[name = string("op_810_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_810_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_810_cast_fp16_1 = split(axis = var_810_axis_0, split_sizes = var_810_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_810_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545024768)))];
+            tensor<fp16, [1, 3, 2560]> h_1_cast_fp16 = mul(x = var_810_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_816 = const()[name = string("op_816"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_819_axes_0 = const()[name = string("op_819_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_817_cast_fp16 = transpose(perm = var_816, x = h_1_cast_fp16)[name = string("transpose_239")];
+            tensor<fp16, [1, 2560, 1, 3]> var_819_cast_fp16 = expand_dims(axes = var_819_axes_0, x = var_817_cast_fp16)[name = string("op_819_cast_fp16")];
+            string q_1_pad_type_0 = const()[name = string("q_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_1_strides_0 = const()[name = string("q_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = string("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_1_dilations_0 = const()[name = string("q_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_1_groups_0 = const()[name = string("q_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_1 = conv(dilations = q_1_dilations_0, groups = q_1_groups_0, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = q_1_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_819_cast_fp16)[name = string("q_1")];
+            tensor<int32, [4]> var_840 = const()[name = string("op_840"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_841 = reshape(shape = var_840, x = q_1)[name = string("op_841")];
+            tensor<int32, [4]> transpose_48_perm_0 = const()[name = string("transpose_48_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_864 = const()[name = string("op_864"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_48 = transpose(perm = transpose_48_perm_0, x = var_841)[name = string("transpose_238")];
+            tensor<fp16, [3, 8, 256]> x_1 = reshape(shape = var_864, x = transpose_48)[name = string("x_1")];
+            int32 var_870 = const()[name = string("op_870"), val = int32(-1)];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_872 = mul(x = x_1, y = const_2_promoted)[name = string("op_872")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_9 = concat(axis = var_870, interleave = input_9_interleave_0, values = (x_1, var_872))[name = string("input_9")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_867_to_fp16 = const()[name = string("op_867_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_867_to_fp16, x = input_9)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_877_split_sizes_0 = const()[name = string("op_877_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_877_axis_0 = const()[name = string("op_877_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_877_0, tensor<fp16, [3, 8, 256]> var_877_1 = split(axis = var_877_axis_0, split_sizes = var_877_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_877")];
+            tensor<fp16, [3, 8, 256]> q_5 = mul(x = var_877_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_5")];
+            tensor<int32, [4]> var_884 = const()[name = string("op_884"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_885 = reshape(shape = var_884, x = q_5)[name = string("op_885")];
+            tensor<int32, [4]> var_890 = const()[name = string("op_890"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_7 = transpose(perm = var_890, x = var_885)[name = string("transpose_237")];
+            tensor<fp16, [1, 8, 3, 256]> var_892_cast_fp16 = mul(x = q_7, y = cos_s)[name = string("op_892_cast_fp16")];
+            tensor<int32, [2]> var_893_split_sizes_0 = const()[name = string("op_893_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_893_axis_0 = const()[name = string("op_893_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_893_0, tensor<fp16, [1, 8, 3, 128]> var_893_1 = split(axis = var_893_axis_0, split_sizes = var_893_split_sizes_0, x = q_7)[name = string("op_893")];
+            fp16 const_3_promoted = const()[name = string("const_3_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_895 = mul(x = var_893_1, y = const_3_promoted)[name = string("op_895")];
+            int32 var_897 = const()[name = string("op_897"), val = int32(-1)];
+            bool var_898_interleave_0 = const()[name = string("op_898_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_898 = concat(axis = var_897, interleave = var_898_interleave_0, values = (var_895, var_893_0))[name = string("op_898")];
+            tensor<fp16, [1, 8, 3, 256]> var_899_cast_fp16 = mul(x = var_898, y = sin_s)[name = string("op_899_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_11_cast_fp16 = add(x = var_892_cast_fp16, y = var_899_cast_fp16)[name = string("q_11_cast_fp16")];
+            string k_1_pad_type_0 = const()[name = string("k_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_1_strides_0 = const()[name = string("k_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = string("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_1_dilations_0 = const()[name = string("k_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_1_groups_0 = const()[name = string("k_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_1 = conv(dilations = k_1_dilations_0, groups = k_1_groups_0, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = k_1_strides_0, weight = layers_0_self_attn_k_proj_weight_palettized, x = var_819_cast_fp16)[name = string("k_1")];
+            tensor<int32, [4]> var_917 = const()[name = string("op_917"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_918 = reshape(shape = var_917, x = k_1)[name = string("op_918")];
+            tensor<int32, [4]> transpose_49_perm_0 = const()[name = string("transpose_49_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_1_pad_type_0 = const()[name = string("v_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_1_strides_0 = const()[name = string("v_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = string("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_1_dilations_0 = const()[name = string("v_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_1_groups_0 = const()[name = string("v_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_1 = conv(dilations = v_1_dilations_0, groups = v_1_groups_0, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = v_1_strides_0, weight = layers_0_self_attn_v_proj_weight_palettized, x = var_819_cast_fp16)[name = string("v_1")];
+            tensor<int32, [4]> var_945 = const()[name = string("op_945"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_946 = reshape(shape = var_945, x = v_1)[name = string("op_946")];
+            tensor<int32, [4]> var_951 = const()[name = string("op_951"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_969 = const()[name = string("op_969"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_49 = transpose(perm = transpose_49_perm_0, x = var_918)[name = string("transpose_236")];
+            tensor<fp16, [3, 2, 256]> x_3 = reshape(shape = var_969, x = transpose_49)[name = string("x_3")];
+            int32 var_975 = const()[name = string("op_975"), val = int32(-1)];
+            fp16 const_4_promoted = const()[name = string("const_4_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_977 = mul(x = x_3, y = const_4_promoted)[name = string("op_977")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_11 = concat(axis = var_975, interleave = input_11_interleave_0, values = (x_3, var_977))[name = string("input_11")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_972_to_fp16 = const()[name = string("op_972_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_972_to_fp16, x = input_11)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_982_split_sizes_0 = const()[name = string("op_982_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_982_axis_0 = const()[name = string("op_982_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_982_0, tensor<fp16, [3, 2, 256]> var_982_1 = split(axis = var_982_axis_0, split_sizes = var_982_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_982")];
+            tensor<fp16, [3, 2, 256]> k_5 = mul(x = var_982_0, y = layers_0_self_attn_k_norm_weight)[name = string("k_5")];
+            tensor<int32, [4]> var_989 = const()[name = string("op_989"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_990 = reshape(shape = var_989, x = k_5)[name = string("op_990")];
+            tensor<int32, [4]> var_995 = const()[name = string("op_995"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_997_promoted = const()[name = string("op_997_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_952 = transpose(perm = var_951, x = var_946)[name = string("transpose_235")];
+            tensor<fp16, [1, 2, 3, 256]> var_998 = pow(x = var_952, y = var_997_promoted)[name = string("op_998")];
+            tensor<int32, [1]> var_1003_axes_0 = const()[name = string("op_1003_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1003_keep_dims_0 = const()[name = string("op_1003_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_1003 = reduce_mean(axes = var_1003_axes_0, keep_dims = var_1003_keep_dims_0, x = var_998)[name = string("op_1003")];
+            fp16 var_1005_to_fp16 = const()[name = string("op_1005_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_1_cast_fp16 = add(x = var_1003, y = var_1005_to_fp16)[name = string("mean_sq_1_cast_fp16")];
+            fp32 var_1007_epsilon_0 = const()[name = string("op_1007_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_1007_cast_fp16 = rsqrt(epsilon = var_1007_epsilon_0, x = mean_sq_1_cast_fp16)[name = string("op_1007_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_15_cast_fp16 = mul(x = var_952, y = var_1007_cast_fp16)[name = string("input_15_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_9 = transpose(perm = var_995, x = var_990)[name = string("transpose_234")];
+            tensor<fp16, [1, 2, 3, 256]> var_1009_cast_fp16 = mul(x = q_9, y = cos_s)[name = string("op_1009_cast_fp16")];
+            tensor<int32, [2]> var_1010_split_sizes_0 = const()[name = string("op_1010_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1010_axis_0 = const()[name = string("op_1010_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_1010_0, tensor<fp16, [1, 2, 3, 128]> var_1010_1 = split(axis = var_1010_axis_0, split_sizes = var_1010_split_sizes_0, x = q_9)[name = string("op_1010")];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_1012 = mul(x = var_1010_1, y = const_5_promoted)[name = string("op_1012")];
+            int32 var_1014 = const()[name = string("op_1014"), val = int32(-1)];
+            bool var_1015_interleave_0 = const()[name = string("op_1015_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_1015 = concat(axis = var_1014, interleave = var_1015_interleave_0, values = (var_1012, var_1010_0))[name = string("op_1015")];
+            tensor<fp16, [1, 2, 3, 256]> var_1016_cast_fp16 = mul(x = var_1015, y = sin_s)[name = string("op_1016_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_13_cast_fp16 = add(x = var_1009_cast_fp16, y = var_1016_cast_fp16)[name = string("input_13_cast_fp16")];
+            tensor<int32, [8]> k_padded_1_pad_0 = const()[name = string("k_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_1_mode_0 = const()[name = string("k_padded_1_mode_0"), val = string("constant")];
+            fp16 const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_1_cast_fp16 = pad(constant_val = const_6_to_fp16, mode = k_padded_1_mode_0, pad = k_padded_1_pad_0, x = input_13_cast_fp16)[name = string("k_padded_1_cast_fp16")];
+            tensor<int32, [8]> v_padded_1_pad_0 = const()[name = string("v_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_1_mode_0 = const()[name = string("v_padded_1_mode_0"), val = string("constant")];
+            fp16 const_7_to_fp16 = const()[name = string("const_7_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_1_cast_fp16 = pad(constant_val = const_7_to_fp16, mode = v_padded_1_mode_0, pad = v_padded_1_pad_0, x = input_15_cast_fp16)[name = string("v_padded_1_cast_fp16")];
+            tensor<int32, [4]> slot_k_1_begin_0 = const()[name = string("slot_k_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_1_end_0 = const()[name = string("slot_k_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_1_end_mask_0 = const()[name = string("slot_k_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_1_cast_fp16 = slice_by_index(begin = slot_k_1_begin_0, end = slot_k_1_end_0, end_mask = slot_k_1_end_mask_0, x = K_sliding_in)[name = string("slot_k_1_cast_fp16")];
+            tensor<int32, [4]> slot_v_1_begin_0 = const()[name = string("slot_v_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_1_end_0 = const()[name = string("slot_v_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_1_end_mask_0 = const()[name = string("slot_v_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_1_cast_fp16 = slice_by_index(begin = slot_v_1_begin_0, end = slot_v_1_end_0, end_mask = slot_v_1_end_mask_0, x = V_sliding_in)[name = string("slot_v_1_cast_fp16")];
+            tensor<int32, [4]> var_1055_begin_0 = const()[name = string("op_1055_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1055_end_0 = const()[name = string("op_1055_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1055_end_mask_0 = const()[name = string("op_1055_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1055_cast_fp16 = slice_by_index(begin = var_1055_begin_0, end = var_1055_end_0, end_mask = var_1055_end_mask_0, x = slot_k_1_cast_fp16)[name = string("op_1055_cast_fp16")];
+            int32 var_1062 = const()[name = string("op_1062"), val = int32(2)];
+            bool new_k_1_interleave_0 = const()[name = string("new_k_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_1_cast_fp16 = concat(axis = var_1062, interleave = new_k_1_interleave_0, values = (var_1055_cast_fp16, k_padded_1_cast_fp16))[name = string("new_k_1_cast_fp16")];
+            tensor<int32, [4]> var_1078_begin_0 = const()[name = string("op_1078_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1078_end_0 = const()[name = string("op_1078_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1078_end_mask_0 = const()[name = string("op_1078_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1078_cast_fp16 = slice_by_index(begin = var_1078_begin_0, end = var_1078_end_0, end_mask = var_1078_end_mask_0, x = slot_v_1_cast_fp16)[name = string("op_1078_cast_fp16")];
+            int32 var_1085 = const()[name = string("op_1085"), val = int32(2)];
+            bool new_v_1_interleave_0 = const()[name = string("new_v_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_1_cast_fp16 = concat(axis = var_1085, interleave = new_v_1_interleave_0, values = (var_1078_cast_fp16, v_padded_1_cast_fp16))[name = string("new_v_1_cast_fp16")];
+            tensor<int32, [4]> var_1096_begin_0 = const()[name = string("op_1096_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1096_end_0 = const()[name = string("op_1096_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1096_end_mask_0 = const()[name = string("op_1096_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_1096_cast_fp16 = slice_by_index(begin = var_1096_begin_0, end = var_1096_end_0, end_mask = var_1096_end_mask_0, x = K_sliding_in)[name = string("op_1096_cast_fp16")];
+            int32 var_1098 = const()[name = string("op_1098"), val = int32(0)];
+            bool K_sliding_out_1_interleave_0 = const()[name = string("K_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_1_cast_fp16 = concat(axis = var_1098, interleave = K_sliding_out_1_interleave_0, values = (new_k_1_cast_fp16, var_1096_cast_fp16))[name = string("K_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_1109_begin_0 = const()[name = string("op_1109_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1109_end_0 = const()[name = string("op_1109_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1109_end_mask_0 = const()[name = string("op_1109_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_1109_cast_fp16 = slice_by_index(begin = var_1109_begin_0, end = var_1109_end_0, end_mask = var_1109_end_mask_0, x = V_sliding_in)[name = string("op_1109_cast_fp16")];
+            int32 var_1111 = const()[name = string("op_1111"), val = int32(0)];
+            bool V_sliding_out_1_interleave_0 = const()[name = string("V_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_1_cast_fp16 = concat(axis = var_1111, interleave = V_sliding_out_1_interleave_0, values = (new_v_1_cast_fp16, var_1109_cast_fp16))[name = string("V_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_1117_begin_0 = const()[name = string("op_1117_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1117_end_0 = const()[name = string("op_1117_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1117_end_mask_0 = const()[name = string("op_1117_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1117_cast_fp16 = slice_by_index(begin = var_1117_begin_0, end = var_1117_end_0, end_mask = var_1117_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1117_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_1_begin_0 = const()[name = string("K_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_1_end_0 = const()[name = string("K_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_1_end_mask_0 = const()[name = string("K_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_1_cast_fp16 = slice_by_index(begin = K_for_attn_1_begin_0, end = K_for_attn_1_end_0, end_mask = K_for_attn_1_end_mask_0, x = var_1117_cast_fp16)[name = string("K_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> var_1127_begin_0 = const()[name = string("op_1127_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1127_end_0 = const()[name = string("op_1127_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1127_end_mask_0 = const()[name = string("op_1127_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1127_cast_fp16 = slice_by_index(begin = var_1127_begin_0, end = var_1127_end_0, end_mask = var_1127_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1127_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_1_begin_0 = const()[name = string("V_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_1_end_0 = const()[name = string("V_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_1_end_mask_0 = const()[name = string("V_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_1_cast_fp16 = slice_by_index(begin = V_for_attn_1_begin_0, end = V_for_attn_1_end_0, end_mask = V_for_attn_1_end_mask_0, x = var_1127_cast_fp16)[name = string("V_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = K_for_attn_1_cast_fp16)[name = string("transpose_233")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_232")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_50_perm_0 = const()[name = string("transpose_50_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = V_for_attn_1_cast_fp16)[name = string("transpose_231")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_230")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_50_cast_fp16 = transpose(perm = transpose_50_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_229")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_11_cast_fp16, y = transpose_50_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_7_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_7_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_7_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 3, 512]> var_1162 = sub(x = x_7_cast_fp16, y = reduce_max_0)[name = string("op_1162")];
+            tensor<fp16, [1, 8, 3, 512]> var_1168 = exp(x = var_1162)[name = string("op_1168")];
+            tensor<int32, [1]> var_1178_axes_0 = const()[name = string("op_1178_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1178_keep_dims_0 = const()[name = string("op_1178_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1178 = reduce_sum(axes = var_1178_axes_0, keep_dims = var_1178_keep_dims_0, x = var_1168)[name = string("op_1178")];
+            tensor<fp16, [1, 8, 3, 512]> var_1184_cast_fp16 = real_div(x = var_1168, y = var_1178)[name = string("op_1184_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_228")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_1184_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_1195 = const()[name = string("op_1195"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1202 = const()[name = string("op_1202"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1196_cast_fp16 = transpose(perm = var_1195, x = attn_output_1_cast_fp16)[name = string("transpose_227")];
+            tensor<fp16, [1, 3, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_1202, x = var_1196_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_1207 = const()[name = string("op_1207"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1223_pad_type_0 = const()[name = string("op_1223_pad_type_0"), val = string("valid")];
+            int32 var_1223_groups_0 = const()[name = string("op_1223_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1223_strides_0 = const()[name = string("op_1223_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1223_pad_0 = const()[name = string("op_1223_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1223_dilations_0 = const()[name = string("op_1223_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545029952))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547651456))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1208_cast_fp16 = transpose(perm = var_1207, x = attn_output_3_cast_fp16)[name = string("transpose_226")];
+            tensor<fp16, [1, 2560, 3]> var_1223_cast_fp16 = conv(dilations = var_1223_dilations_0, groups = var_1223_groups_0, pad = var_1223_pad_0, pad_type = var_1223_pad_type_0, strides = var_1223_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_1208_cast_fp16)[name = string("op_1223_cast_fp16")];
+            tensor<int32, [3]> var_1227 = const()[name = string("op_1227"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1233 = const()[name = string("op_1233"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_11_cast_fp16 = transpose(perm = var_1227, x = var_1223_cast_fp16)[name = string("transpose_225")];
+            tensor<fp16, [1, 3, 2560]> var_1235_cast_fp16 = mul(x = x_11_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_1235_cast_fp16")];
+            bool input_19_interleave_0 = const()[name = string("input_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_19_cast_fp16 = concat(axis = var_1233, interleave = input_19_interleave_0, values = (x_11_cast_fp16, var_1235_cast_fp16))[name = string("input_19_cast_fp16")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1230_to_fp16 = const()[name = string("op_1230_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_1230_to_fp16, x = input_19_cast_fp16)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_1240_split_sizes_0 = const()[name = string("op_1240_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1240_axis_0 = const()[name = string("op_1240_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1240_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1240_cast_fp16_1 = split(axis = var_1240_axis_0, split_sizes = var_1240_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_1240_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547654080)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_5_cast_fp16 = mul(x = var_1240_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_13_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_13_cast_fp16")];
+            int32 var_1249 = const()[name = string("op_1249"), val = int32(-1)];
+            fp16 const_9_promoted_to_fp16 = const()[name = string("const_9_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1251_cast_fp16 = mul(x = x_13_cast_fp16, y = const_9_promoted_to_fp16)[name = string("op_1251_cast_fp16")];
+            bool input_21_interleave_0 = const()[name = string("input_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_21_cast_fp16 = concat(axis = var_1249, interleave = input_21_interleave_0, values = (x_13_cast_fp16, var_1251_cast_fp16))[name = string("input_21_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1246_to_fp16 = const()[name = string("op_1246_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_1246_to_fp16, x = input_21_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_1256_split_sizes_0 = const()[name = string("op_1256_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1256_axis_0 = const()[name = string("op_1256_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1256_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1256_cast_fp16_1 = split(axis = var_1256_axis_0, split_sizes = var_1256_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_1256_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547659264)))];
+            tensor<fp16, [1, 3, 2560]> h_3_cast_fp16 = mul(x = var_1256_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_1267 = const()[name = string("op_1267"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_23_axes_0 = const()[name = string("input_23_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1268 = transpose(perm = var_1267, x = h_3_cast_fp16)[name = string("transpose_224")];
+            tensor<fp16, [1, 2560, 1, 3]> input_23 = expand_dims(axes = input_23_axes_0, x = var_1268)[name = string("input_23")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_23)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_23)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 3]> input_25 = mul(x = gate_3, y = up_1)[name = string("input_25")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_25)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_1308_axes_0 = const()[name = string("op_1308_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1308 = squeeze(axes = var_1308_axes_0, x = mlp_out_1)[name = string("op_1308")];
+            tensor<int32, [3]> var_1312 = const()[name = string("op_1312"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1318 = const()[name = string("op_1318"), val = int32(-1)];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_15 = transpose(perm = var_1312, x = var_1308)[name = string("transpose_223")];
+            tensor<fp16, [1, 3, 2560]> var_1320 = mul(x = x_15, y = const_10_promoted)[name = string("op_1320")];
+            bool input_27_interleave_0 = const()[name = string("input_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_27 = concat(axis = var_1318, interleave = input_27_interleave_0, values = (x_15, var_1320))[name = string("input_27")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1315_to_fp16 = const()[name = string("op_1315_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_1315_to_fp16, x = input_27)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_1325_split_sizes_0 = const()[name = string("op_1325_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1325_axis_0 = const()[name = string("op_1325_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1325_0, tensor<fp16, [1, 3, 2560]> var_1325_1 = split(axis = var_1325_axis_0, split_sizes = var_1325_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_1325")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_3 = mul(x = var_1325_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_5_cast_fp16 = add(x = x_13_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 3, 256])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_1353 = const()[name = string("op_1353"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_29_axes_0 = const()[name = string("input_29_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1354 = transpose(perm = var_1353, x = hidden_states_5_cast_fp16)[name = string("transpose_222")];
+            tensor<fp16, [1, 2560, 1, 3]> input_29 = expand_dims(axes = input_29_axes_0, x = var_1354)[name = string("input_29")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_29)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_1373 = const()[name = string("op_1373"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1374_cast_fp16 = transpose(perm = var_1373, x = per_layer_slice_1_cast_fp16)[name = string("transpose_221")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_1374_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_31_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_31_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547664448))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547992192))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_31_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_1390_axes_0 = const()[name = string("op_1390_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1390_cast_fp16 = squeeze(axes = var_1390_axes_0, x = gated_5_cast_fp16)[name = string("op_1390_cast_fp16")];
+            tensor<int32, [3]> var_1394 = const()[name = string("op_1394"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1400 = const()[name = string("op_1400"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_17_cast_fp16 = transpose(perm = var_1394, x = var_1390_cast_fp16)[name = string("transpose_220")];
+            tensor<fp16, [1, 3, 2560]> var_1402_cast_fp16 = mul(x = x_17_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1402_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_33_cast_fp16 = concat(axis = var_1400, interleave = input_33_interleave_0, values = (x_17_cast_fp16, var_1402_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1397_to_fp16 = const()[name = string("op_1397_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_1397_to_fp16, x = input_33_cast_fp16)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_1407_split_sizes_0 = const()[name = string("op_1407_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1407_axis_0 = const()[name = string("op_1407_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1407_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1407_cast_fp16_1 = split(axis = var_1407_axis_0, split_sizes = var_1407_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_1407_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(547994816)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_9_cast_fp16 = mul(x = var_1407_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.f4p-5])];
+            tensor<fp16, [1, 3, 2560]> x_19_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_12_promoted_to_fp16)[name = string("x_19_cast_fp16")];
+            int32 var_1422 = const()[name = string("op_1422"), val = int32(-1)];
+            fp16 const_13_promoted_to_fp16 = const()[name = string("const_13_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1424_cast_fp16 = mul(x = x_19_cast_fp16, y = const_13_promoted_to_fp16)[name = string("op_1424_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_35_cast_fp16 = concat(axis = var_1422, interleave = input_35_interleave_0, values = (x_19_cast_fp16, var_1424_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1419_to_fp16 = const()[name = string("op_1419_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1419_to_fp16, x = input_35_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1429_split_sizes_0 = const()[name = string("op_1429_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1429_axis_0 = const()[name = string("op_1429_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1429_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1429_cast_fp16_1 = split(axis = var_1429_axis_0, split_sizes = var_1429_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1429_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(548000000)))];
+            tensor<fp16, [1, 3, 2560]> h_7_cast_fp16 = mul(x = var_1429_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_1435 = const()[name = string("op_1435"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1438_axes_0 = const()[name = string("op_1438_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1436_cast_fp16 = transpose(perm = var_1435, x = h_7_cast_fp16)[name = string("transpose_219")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1438_cast_fp16 = expand_dims(axes = var_1438_axes_0, x = var_1436_cast_fp16)[name = string("op_1438_cast_fp16")];
+            string q_13_pad_type_0 = const()[name = string("q_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_13_strides_0 = const()[name = string("q_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_13_pad_0 = const()[name = string("q_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_13_dilations_0 = const()[name = string("q_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_13_groups_0 = const()[name = string("q_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_13 = conv(dilations = q_13_dilations_0, groups = q_13_groups_0, pad = q_13_pad_0, pad_type = q_13_pad_type_0, strides = q_13_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_1438_cast_fp16)[name = string("q_13")];
+            tensor<int32, [4]> var_1459 = const()[name = string("op_1459"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_1460 = reshape(shape = var_1459, x = q_13)[name = string("op_1460")];
+            tensor<int32, [4]> transpose_51_perm_0 = const()[name = string("transpose_51_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_1483 = const()[name = string("op_1483"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_51 = transpose(perm = transpose_51_perm_0, x = var_1460)[name = string("transpose_218")];
+            tensor<fp16, [3, 8, 256]> x_21 = reshape(shape = var_1483, x = transpose_51)[name = string("x_21")];
+            int32 var_1489 = const()[name = string("op_1489"), val = int32(-1)];
+            fp16 const_14_promoted = const()[name = string("const_14_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_1491 = mul(x = x_21, y = const_14_promoted)[name = string("op_1491")];
+            bool input_39_interleave_0 = const()[name = string("input_39_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_39 = concat(axis = var_1489, interleave = input_39_interleave_0, values = (x_21, var_1491))[name = string("input_39")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1486_to_fp16 = const()[name = string("op_1486_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1486_to_fp16, x = input_39)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1496_split_sizes_0 = const()[name = string("op_1496_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1496_axis_0 = const()[name = string("op_1496_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_1496_0, tensor<fp16, [3, 8, 256]> var_1496_1 = split(axis = var_1496_axis_0, split_sizes = var_1496_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1496")];
+            tensor<fp16, [3, 8, 256]> q_17 = mul(x = var_1496_0, y = layers_1_self_attn_q_norm_weight)[name = string("q_17")];
+            tensor<int32, [4]> var_1503 = const()[name = string("op_1503"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_1504 = reshape(shape = var_1503, x = q_17)[name = string("op_1504")];
+            tensor<int32, [4]> var_1509 = const()[name = string("op_1509"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_19 = transpose(perm = var_1509, x = var_1504)[name = string("transpose_217")];
+            tensor<fp16, [1, 8, 3, 256]> var_1511_cast_fp16 = mul(x = q_19, y = cos_s)[name = string("op_1511_cast_fp16")];
+            tensor<int32, [2]> var_1512_split_sizes_0 = const()[name = string("op_1512_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1512_axis_0 = const()[name = string("op_1512_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_1512_0, tensor<fp16, [1, 8, 3, 128]> var_1512_1 = split(axis = var_1512_axis_0, split_sizes = var_1512_split_sizes_0, x = q_19)[name = string("op_1512")];
+            fp16 const_15_promoted = const()[name = string("const_15_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_1514 = mul(x = var_1512_1, y = const_15_promoted)[name = string("op_1514")];
+            int32 var_1516 = const()[name = string("op_1516"), val = int32(-1)];
+            bool var_1517_interleave_0 = const()[name = string("op_1517_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_1517 = concat(axis = var_1516, interleave = var_1517_interleave_0, values = (var_1514, var_1512_0))[name = string("op_1517")];
+            tensor<fp16, [1, 8, 3, 256]> var_1518_cast_fp16 = mul(x = var_1517, y = sin_s)[name = string("op_1518_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_23_cast_fp16 = add(x = var_1511_cast_fp16, y = var_1518_cast_fp16)[name = string("q_23_cast_fp16")];
+            string k_7_pad_type_0 = const()[name = string("k_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_7_strides_0 = const()[name = string("k_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_7_pad_0 = const()[name = string("k_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_7_dilations_0 = const()[name = string("k_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_7_groups_0 = const()[name = string("k_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_7 = conv(dilations = k_7_dilations_0, groups = k_7_groups_0, pad = k_7_pad_0, pad_type = k_7_pad_type_0, strides = k_7_strides_0, weight = layers_1_self_attn_k_proj_weight_palettized, x = var_1438_cast_fp16)[name = string("k_7")];
+            tensor<int32, [4]> var_1536 = const()[name = string("op_1536"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_1537 = reshape(shape = var_1536, x = k_7)[name = string("op_1537")];
+            tensor<int32, [4]> transpose_52_perm_0 = const()[name = string("transpose_52_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_3_pad_type_0 = const()[name = string("v_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_3_strides_0 = const()[name = string("v_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_3_pad_0 = const()[name = string("v_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_3_dilations_0 = const()[name = string("v_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_3_groups_0 = const()[name = string("v_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_3 = conv(dilations = v_3_dilations_0, groups = v_3_groups_0, pad = v_3_pad_0, pad_type = v_3_pad_type_0, strides = v_3_strides_0, weight = layers_1_self_attn_v_proj_weight_palettized, x = var_1438_cast_fp16)[name = string("v_3")];
+            tensor<int32, [4]> var_1564 = const()[name = string("op_1564"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_1565 = reshape(shape = var_1564, x = v_3)[name = string("op_1565")];
+            tensor<int32, [4]> var_1570 = const()[name = string("op_1570"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1588 = const()[name = string("op_1588"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_52 = transpose(perm = transpose_52_perm_0, x = var_1537)[name = string("transpose_216")];
+            tensor<fp16, [3, 2, 256]> x_23 = reshape(shape = var_1588, x = transpose_52)[name = string("x_23")];
+            int32 var_1594 = const()[name = string("op_1594"), val = int32(-1)];
+            fp16 const_16_promoted = const()[name = string("const_16_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_1596 = mul(x = x_23, y = const_16_promoted)[name = string("op_1596")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_41 = concat(axis = var_1594, interleave = input_41_interleave_0, values = (x_23, var_1596))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1591_to_fp16 = const()[name = string("op_1591_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1591_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1601_split_sizes_0 = const()[name = string("op_1601_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1601_axis_0 = const()[name = string("op_1601_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_1601_0, tensor<fp16, [3, 2, 256]> var_1601_1 = split(axis = var_1601_axis_0, split_sizes = var_1601_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1601")];
+            tensor<fp16, [3, 2, 256]> k_11 = mul(x = var_1601_0, y = layers_1_self_attn_k_norm_weight)[name = string("k_11")];
+            tensor<int32, [4]> var_1608 = const()[name = string("op_1608"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_1609 = reshape(shape = var_1608, x = k_11)[name = string("op_1609")];
+            tensor<int32, [4]> var_1614 = const()[name = string("op_1614"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_1616_promoted = const()[name = string("op_1616_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_1571 = transpose(perm = var_1570, x = var_1565)[name = string("transpose_215")];
+            tensor<fp16, [1, 2, 3, 256]> var_1617 = pow(x = var_1571, y = var_1616_promoted)[name = string("op_1617")];
+            tensor<int32, [1]> var_1622_axes_0 = const()[name = string("op_1622_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1622_keep_dims_0 = const()[name = string("op_1622_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_1622 = reduce_mean(axes = var_1622_axes_0, keep_dims = var_1622_keep_dims_0, x = var_1617)[name = string("op_1622")];
+            fp16 var_1624_to_fp16 = const()[name = string("op_1624_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_3_cast_fp16 = add(x = var_1622, y = var_1624_to_fp16)[name = string("mean_sq_3_cast_fp16")];
+            fp32 var_1626_epsilon_0 = const()[name = string("op_1626_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_1626_cast_fp16 = rsqrt(epsilon = var_1626_epsilon_0, x = mean_sq_3_cast_fp16)[name = string("op_1626_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_45_cast_fp16 = mul(x = var_1571, y = var_1626_cast_fp16)[name = string("input_45_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_21 = transpose(perm = var_1614, x = var_1609)[name = string("transpose_214")];
+            tensor<fp16, [1, 2, 3, 256]> var_1628_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_1628_cast_fp16")];
+            tensor<int32, [2]> var_1629_split_sizes_0 = const()[name = string("op_1629_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1629_axis_0 = const()[name = string("op_1629_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_1629_0, tensor<fp16, [1, 2, 3, 128]> var_1629_1 = split(axis = var_1629_axis_0, split_sizes = var_1629_split_sizes_0, x = q_21)[name = string("op_1629")];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_1631 = mul(x = var_1629_1, y = const_17_promoted)[name = string("op_1631")];
+            int32 var_1633 = const()[name = string("op_1633"), val = int32(-1)];
+            bool var_1634_interleave_0 = const()[name = string("op_1634_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_1634 = concat(axis = var_1633, interleave = var_1634_interleave_0, values = (var_1631, var_1629_0))[name = string("op_1634")];
+            tensor<fp16, [1, 2, 3, 256]> var_1635_cast_fp16 = mul(x = var_1634, y = sin_s)[name = string("op_1635_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_43_cast_fp16 = add(x = var_1628_cast_fp16, y = var_1635_cast_fp16)[name = string("input_43_cast_fp16")];
+            tensor<int32, [8]> k_padded_3_pad_0 = const()[name = string("k_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_3_mode_0 = const()[name = string("k_padded_3_mode_0"), val = string("constant")];
+            fp16 const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_3_cast_fp16 = pad(constant_val = const_18_to_fp16, mode = k_padded_3_mode_0, pad = k_padded_3_pad_0, x = input_43_cast_fp16)[name = string("k_padded_3_cast_fp16")];
+            tensor<int32, [8]> v_padded_3_pad_0 = const()[name = string("v_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_3_mode_0 = const()[name = string("v_padded_3_mode_0"), val = string("constant")];
+            fp16 const_19_to_fp16 = const()[name = string("const_19_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_3_cast_fp16 = pad(constant_val = const_19_to_fp16, mode = v_padded_3_mode_0, pad = v_padded_3_pad_0, x = input_45_cast_fp16)[name = string("v_padded_3_cast_fp16")];
+            tensor<int32, [4]> slot_k_3_begin_0 = const()[name = string("slot_k_3_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_3_end_0 = const()[name = string("slot_k_3_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_3_end_mask_0 = const()[name = string("slot_k_3_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_3_cast_fp16 = slice_by_index(begin = slot_k_3_begin_0, end = slot_k_3_end_0, end_mask = slot_k_3_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("slot_k_3_cast_fp16")];
+            tensor<int32, [4]> slot_v_3_begin_0 = const()[name = string("slot_v_3_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_3_end_0 = const()[name = string("slot_v_3_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_3_end_mask_0 = const()[name = string("slot_v_3_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_3_cast_fp16 = slice_by_index(begin = slot_v_3_begin_0, end = slot_v_3_end_0, end_mask = slot_v_3_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("slot_v_3_cast_fp16")];
+            tensor<int32, [4]> var_1674_begin_0 = const()[name = string("op_1674_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1674_end_0 = const()[name = string("op_1674_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1674_end_mask_0 = const()[name = string("op_1674_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1674_cast_fp16 = slice_by_index(begin = var_1674_begin_0, end = var_1674_end_0, end_mask = var_1674_end_mask_0, x = slot_k_3_cast_fp16)[name = string("op_1674_cast_fp16")];
+            int32 var_1681 = const()[name = string("op_1681"), val = int32(2)];
+            bool new_k_3_interleave_0 = const()[name = string("new_k_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_3_cast_fp16 = concat(axis = var_1681, interleave = new_k_3_interleave_0, values = (var_1674_cast_fp16, k_padded_3_cast_fp16))[name = string("new_k_3_cast_fp16")];
+            tensor<int32, [4]> var_1697_begin_0 = const()[name = string("op_1697_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1697_end_0 = const()[name = string("op_1697_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1697_end_mask_0 = const()[name = string("op_1697_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1697_cast_fp16 = slice_by_index(begin = var_1697_begin_0, end = var_1697_end_0, end_mask = var_1697_end_mask_0, x = slot_v_3_cast_fp16)[name = string("op_1697_cast_fp16")];
+            int32 var_1704 = const()[name = string("op_1704"), val = int32(2)];
+            bool new_v_3_interleave_0 = const()[name = string("new_v_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_3_cast_fp16 = concat(axis = var_1704, interleave = new_v_3_interleave_0, values = (var_1697_cast_fp16, v_padded_3_cast_fp16))[name = string("new_v_3_cast_fp16")];
+            tensor<int32, [4]> var_1715_begin_0 = const()[name = string("op_1715_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1715_end_0 = const()[name = string("op_1715_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1715_end_mask_0 = const()[name = string("op_1715_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_1715_cast_fp16 = slice_by_index(begin = var_1715_begin_0, end = var_1715_end_0, end_mask = var_1715_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1715_cast_fp16")];
+            int32 var_1717 = const()[name = string("op_1717"), val = int32(0)];
+            bool K_sliding_out_3_interleave_0 = const()[name = string("K_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_3_cast_fp16 = concat(axis = var_1717, interleave = K_sliding_out_3_interleave_0, values = (var_1117_cast_fp16, new_k_3_cast_fp16, var_1715_cast_fp16))[name = string("K_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1728_begin_0 = const()[name = string("op_1728_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1728_end_0 = const()[name = string("op_1728_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1728_end_mask_0 = const()[name = string("op_1728_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_1728_cast_fp16 = slice_by_index(begin = var_1728_begin_0, end = var_1728_end_0, end_mask = var_1728_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1728_cast_fp16")];
+            int32 var_1730 = const()[name = string("op_1730"), val = int32(0)];
+            bool V_sliding_out_3_interleave_0 = const()[name = string("V_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_3_cast_fp16 = concat(axis = var_1730, interleave = V_sliding_out_3_interleave_0, values = (var_1127_cast_fp16, new_v_3_cast_fp16, var_1728_cast_fp16))[name = string("V_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1736_begin_0 = const()[name = string("op_1736_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1736_end_0 = const()[name = string("op_1736_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1736_end_mask_0 = const()[name = string("op_1736_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1736_cast_fp16 = slice_by_index(begin = var_1736_begin_0, end = var_1736_end_0, end_mask = var_1736_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("op_1736_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_3_begin_0 = const()[name = string("K_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_3_end_0 = const()[name = string("K_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_3_end_mask_0 = const()[name = string("K_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_3_cast_fp16 = slice_by_index(begin = K_for_attn_3_begin_0, end = K_for_attn_3_end_0, end_mask = K_for_attn_3_end_mask_0, x = var_1736_cast_fp16)[name = string("K_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> var_1746_begin_0 = const()[name = string("op_1746_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1746_end_0 = const()[name = string("op_1746_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1746_end_mask_0 = const()[name = string("op_1746_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1746_cast_fp16 = slice_by_index(begin = var_1746_begin_0, end = var_1746_end_0, end_mask = var_1746_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("op_1746_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_3_begin_0 = const()[name = string("V_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_3_end_0 = const()[name = string("V_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_3_end_mask_0 = const()[name = string("V_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_3_cast_fp16 = slice_by_index(begin = V_for_attn_3_begin_0, end = V_for_attn_3_end_0, end_mask = V_for_attn_3_end_mask_0, x = var_1746_cast_fp16)[name = string("V_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> transpose_4_perm_0 = const()[name = string("transpose_4_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_2_reps_0 = const()[name = string("tile_2_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_4_cast_fp16 = transpose(perm = transpose_4_perm_0, x = K_for_attn_3_cast_fp16)[name = string("transpose_213")];
+            tensor<fp16, [8, 1, 512, 256]> tile_2_cast_fp16 = tile(reps = tile_2_reps_0, x = transpose_4_cast_fp16)[name = string("tile_2_cast_fp16")];
+            tensor<int32, [5]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_4_cast_fp16 = reshape(shape = concat_4, x = tile_2_cast_fp16)[name = string("reshape_4_cast_fp16")];
+            tensor<int32, [5]> transpose_5_perm_0 = const()[name = string("transpose_5_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_5 = const()[name = string("concat_5"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_5_cast_fp16 = transpose(perm = transpose_5_perm_0, x = reshape_4_cast_fp16)[name = string("transpose_212")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_5_cast_fp16 = reshape(shape = concat_5, x = transpose_5_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [4]> transpose_53_perm_0 = const()[name = string("transpose_53_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_6_perm_0 = const()[name = string("transpose_6_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_3_reps_0 = const()[name = string("tile_3_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_6_cast_fp16 = transpose(perm = transpose_6_perm_0, x = V_for_attn_3_cast_fp16)[name = string("transpose_211")];
+            tensor<fp16, [8, 1, 512, 256]> tile_3_cast_fp16 = tile(reps = tile_3_reps_0, x = transpose_6_cast_fp16)[name = string("tile_3_cast_fp16")];
+            tensor<int32, [5]> concat_6 = const()[name = string("concat_6"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_6_cast_fp16 = reshape(shape = concat_6, x = tile_3_cast_fp16)[name = string("reshape_6_cast_fp16")];
+            tensor<int32, [5]> transpose_7_perm_0 = const()[name = string("transpose_7_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_7 = const()[name = string("concat_7"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_7_cast_fp16 = transpose(perm = transpose_7_perm_0, x = reshape_6_cast_fp16)[name = string("transpose_210")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_7_cast_fp16 = reshape(shape = concat_7, x = transpose_7_cast_fp16)[name = string("reshape_7_cast_fp16")];
+            tensor<int32, [4]> V_expanded_3_perm_0 = const()[name = string("V_expanded_3_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_53_cast_fp16 = transpose(perm = transpose_53_perm_0, x = reshape_5_cast_fp16)[name = string("transpose_209")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_23_cast_fp16, y = transpose_53_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_27_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_27_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_27_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 3, 512]> var_1781 = sub(x = x_27_cast_fp16, y = reduce_max_1)[name = string("op_1781")];
+            tensor<fp16, [1, 8, 3, 512]> var_1787 = exp(x = var_1781)[name = string("op_1787")];
+            tensor<int32, [1]> var_1797_axes_0 = const()[name = string("op_1797_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1797_keep_dims_0 = const()[name = string("op_1797_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1797 = reduce_sum(axes = var_1797_axes_0, keep_dims = var_1797_keep_dims_0, x = var_1787)[name = string("op_1797")];
+            tensor<fp16, [1, 8, 3, 512]> var_1803_cast_fp16 = real_div(x = var_1787, y = var_1797)[name = string("op_1803_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_3_cast_fp16 = transpose(perm = V_expanded_3_perm_0, x = reshape_7_cast_fp16)[name = string("transpose_208")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_1803_cast_fp16, y = V_expanded_3_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_1814 = const()[name = string("op_1814"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1821 = const()[name = string("op_1821"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1815_cast_fp16 = transpose(perm = var_1814, x = attn_output_7_cast_fp16)[name = string("transpose_207")];
+            tensor<fp16, [1, 3, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_1821, x = var_1815_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_1826 = const()[name = string("op_1826"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1842_pad_type_0 = const()[name = string("op_1842_pad_type_0"), val = string("valid")];
+            int32 var_1842_groups_0 = const()[name = string("op_1842_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1842_strides_0 = const()[name = string("op_1842_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1842_pad_0 = const()[name = string("op_1842_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1842_dilations_0 = const()[name = string("op_1842_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(548005184))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550626688))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1827_cast_fp16 = transpose(perm = var_1826, x = attn_output_9_cast_fp16)[name = string("transpose_206")];
+            tensor<fp16, [1, 2560, 3]> var_1842_cast_fp16 = conv(dilations = var_1842_dilations_0, groups = var_1842_groups_0, pad = var_1842_pad_0, pad_type = var_1842_pad_type_0, strides = var_1842_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_1827_cast_fp16)[name = string("op_1842_cast_fp16")];
+            tensor<int32, [3]> var_1846 = const()[name = string("op_1846"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1852 = const()[name = string("op_1852"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_31_cast_fp16 = transpose(perm = var_1846, x = var_1842_cast_fp16)[name = string("transpose_205")];
+            tensor<fp16, [1, 3, 2560]> var_1854_cast_fp16 = mul(x = x_31_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1854_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_49_cast_fp16 = concat(axis = var_1852, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1854_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1849_to_fp16 = const()[name = string("op_1849_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1849_to_fp16, x = input_49_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1859_split_sizes_0 = const()[name = string("op_1859_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1859_axis_0 = const()[name = string("op_1859_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1859_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1859_cast_fp16_1 = split(axis = var_1859_axis_0, split_sizes = var_1859_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1859_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550629312)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_11_cast_fp16 = mul(x = var_1859_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_33_cast_fp16 = add(x = x_19_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_33_cast_fp16")];
+            int32 var_1868 = const()[name = string("op_1868"), val = int32(-1)];
+            fp16 const_21_promoted_to_fp16 = const()[name = string("const_21_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1870_cast_fp16 = mul(x = x_33_cast_fp16, y = const_21_promoted_to_fp16)[name = string("op_1870_cast_fp16")];
+            bool input_51_interleave_0 = const()[name = string("input_51_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_51_cast_fp16 = concat(axis = var_1868, interleave = input_51_interleave_0, values = (x_33_cast_fp16, var_1870_cast_fp16))[name = string("input_51_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1865_to_fp16 = const()[name = string("op_1865_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1865_to_fp16, x = input_51_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1875_split_sizes_0 = const()[name = string("op_1875_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1875_axis_0 = const()[name = string("op_1875_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1875_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1875_cast_fp16_1 = split(axis = var_1875_axis_0, split_sizes = var_1875_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1875_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550634496)))];
+            tensor<fp16, [1, 3, 2560]> h_9_cast_fp16 = mul(x = var_1875_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1886 = const()[name = string("op_1886"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_53_axes_0 = const()[name = string("input_53_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1887 = transpose(perm = var_1886, x = h_9_cast_fp16)[name = string("transpose_204")];
+            tensor<fp16, [1, 2560, 1, 3]> input_53 = expand_dims(axes = input_53_axes_0, x = var_1887)[name = string("input_53")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_53)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_53)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 3]> input_55 = mul(x = gate_7, y = up_3)[name = string("input_55")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_55)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1927_axes_0 = const()[name = string("op_1927_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1927 = squeeze(axes = var_1927_axes_0, x = mlp_out_3)[name = string("op_1927")];
+            tensor<int32, [3]> var_1931 = const()[name = string("op_1931"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1937 = const()[name = string("op_1937"), val = int32(-1)];
+            fp16 const_22_promoted = const()[name = string("const_22_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_35 = transpose(perm = var_1931, x = var_1927)[name = string("transpose_203")];
+            tensor<fp16, [1, 3, 2560]> var_1939 = mul(x = x_35, y = const_22_promoted)[name = string("op_1939")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_57 = concat(axis = var_1937, interleave = input_57_interleave_0, values = (x_35, var_1939))[name = string("input_57")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1934_to_fp16 = const()[name = string("op_1934_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1934_to_fp16, x = input_57)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1944_split_sizes_0 = const()[name = string("op_1944_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1944_axis_0 = const()[name = string("op_1944_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1944_0, tensor<fp16, [1, 3, 2560]> var_1944_1 = split(axis = var_1944_axis_0, split_sizes = var_1944_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1944")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_13 = mul(x = var_1944_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_15_cast_fp16 = add(x = x_33_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 256])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 3, 512])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1972 = const()[name = string("op_1972"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_59_axes_0 = const()[name = string("input_59_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1973 = transpose(perm = var_1972, x = hidden_states_15_cast_fp16)[name = string("transpose_202")];
+            tensor<fp16, [1, 2560, 1, 3]> input_59 = expand_dims(axes = input_59_axes_0, x = var_1973)[name = string("input_59")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_59)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1992 = const()[name = string("op_1992"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1993_cast_fp16 = transpose(perm = var_1992, x = per_layer_slice_3_cast_fp16)[name = string("transpose_201")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1993_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_61_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_61_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550639680))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550967424))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_61_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_2009_axes_0 = const()[name = string("op_2009_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2009_cast_fp16 = squeeze(axes = var_2009_axes_0, x = gated_11_cast_fp16)[name = string("op_2009_cast_fp16")];
+            tensor<int32, [3]> var_2013 = const()[name = string("op_2013"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2019 = const()[name = string("op_2019"), val = int32(-1)];
+            fp16 const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_37_cast_fp16 = transpose(perm = var_2013, x = var_2009_cast_fp16)[name = string("transpose_200")];
+            tensor<fp16, [1, 3, 2560]> var_2021_cast_fp16 = mul(x = x_37_cast_fp16, y = const_23_promoted_to_fp16)[name = string("op_2021_cast_fp16")];
+            bool input_63_interleave_0 = const()[name = string("input_63_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_63_cast_fp16 = concat(axis = var_2019, interleave = input_63_interleave_0, values = (x_37_cast_fp16, var_2021_cast_fp16))[name = string("input_63_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2016_to_fp16 = const()[name = string("op_2016_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_2016_to_fp16, x = input_63_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_2026_split_sizes_0 = const()[name = string("op_2026_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2026_axis_0 = const()[name = string("op_2026_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2026_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2026_cast_fp16_1 = split(axis = var_2026_axis_0, split_sizes = var_2026_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_2026_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550970048)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_19_cast_fp16 = mul(x = var_2026_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.48p-3])];
+            tensor<fp16, [1, 3, 2560]> x_39_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_24_promoted_to_fp16)[name = string("x_39_cast_fp16")];
+            int32 var_2041 = const()[name = string("op_2041"), val = int32(-1)];
+            fp16 const_25_promoted_to_fp16 = const()[name = string("const_25_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2043_cast_fp16 = mul(x = x_39_cast_fp16, y = const_25_promoted_to_fp16)[name = string("op_2043_cast_fp16")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_65_cast_fp16 = concat(axis = var_2041, interleave = input_65_interleave_0, values = (x_39_cast_fp16, var_2043_cast_fp16))[name = string("input_65_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2038_to_fp16 = const()[name = string("op_2038_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_2038_to_fp16, x = input_65_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_2048_split_sizes_0 = const()[name = string("op_2048_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2048_axis_0 = const()[name = string("op_2048_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2048_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2048_cast_fp16_1 = split(axis = var_2048_axis_0, split_sizes = var_2048_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_2048_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550975232)))];
+            tensor<fp16, [1, 3, 2560]> h_13_cast_fp16 = mul(x = var_2048_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_2054 = const()[name = string("op_2054"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2057_axes_0 = const()[name = string("op_2057_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2055_cast_fp16 = transpose(perm = var_2054, x = h_13_cast_fp16)[name = string("transpose_199")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2057_cast_fp16 = expand_dims(axes = var_2057_axes_0, x = var_2055_cast_fp16)[name = string("op_2057_cast_fp16")];
+            string q_25_pad_type_0 = const()[name = string("q_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_25_strides_0 = const()[name = string("q_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_25_pad_0 = const()[name = string("q_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_25_dilations_0 = const()[name = string("q_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_25_groups_0 = const()[name = string("q_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_25 = conv(dilations = q_25_dilations_0, groups = q_25_groups_0, pad = q_25_pad_0, pad_type = q_25_pad_type_0, strides = q_25_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_2057_cast_fp16)[name = string("q_25")];
+            tensor<int32, [4]> var_2078 = const()[name = string("op_2078"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2079 = reshape(shape = var_2078, x = q_25)[name = string("op_2079")];
+            tensor<int32, [4]> transpose_54_perm_0 = const()[name = string("transpose_54_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2102 = const()[name = string("op_2102"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_54 = transpose(perm = transpose_54_perm_0, x = var_2079)[name = string("transpose_198")];
+            tensor<fp16, [3, 8, 256]> x_41 = reshape(shape = var_2102, x = transpose_54)[name = string("x_41")];
+            int32 var_2108 = const()[name = string("op_2108"), val = int32(-1)];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2110 = mul(x = x_41, y = const_26_promoted)[name = string("op_2110")];
+            bool input_69_interleave_0 = const()[name = string("input_69_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_69 = concat(axis = var_2108, interleave = input_69_interleave_0, values = (x_41, var_2110))[name = string("input_69")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2105_to_fp16 = const()[name = string("op_2105_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_2105_to_fp16, x = input_69)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_2115_split_sizes_0 = const()[name = string("op_2115_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2115_axis_0 = const()[name = string("op_2115_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2115_0, tensor<fp16, [3, 8, 256]> var_2115_1 = split(axis = var_2115_axis_0, split_sizes = var_2115_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_2115")];
+            tensor<fp16, [3, 8, 256]> q_29 = mul(x = var_2115_0, y = layers_2_self_attn_q_norm_weight)[name = string("q_29")];
+            tensor<int32, [4]> var_2122 = const()[name = string("op_2122"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2123 = reshape(shape = var_2122, x = q_29)[name = string("op_2123")];
+            tensor<int32, [4]> var_2128 = const()[name = string("op_2128"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_31 = transpose(perm = var_2128, x = var_2123)[name = string("transpose_197")];
+            tensor<fp16, [1, 8, 3, 256]> var_2130_cast_fp16 = mul(x = q_31, y = cos_s)[name = string("op_2130_cast_fp16")];
+            tensor<int32, [2]> var_2131_split_sizes_0 = const()[name = string("op_2131_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2131_axis_0 = const()[name = string("op_2131_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2131_0, tensor<fp16, [1, 8, 3, 128]> var_2131_1 = split(axis = var_2131_axis_0, split_sizes = var_2131_split_sizes_0, x = q_31)[name = string("op_2131")];
+            fp16 const_27_promoted = const()[name = string("const_27_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2133 = mul(x = var_2131_1, y = const_27_promoted)[name = string("op_2133")];
+            int32 var_2135 = const()[name = string("op_2135"), val = int32(-1)];
+            bool var_2136_interleave_0 = const()[name = string("op_2136_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2136 = concat(axis = var_2135, interleave = var_2136_interleave_0, values = (var_2133, var_2131_0))[name = string("op_2136")];
+            tensor<fp16, [1, 8, 3, 256]> var_2137_cast_fp16 = mul(x = var_2136, y = sin_s)[name = string("op_2137_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_35_cast_fp16 = add(x = var_2130_cast_fp16, y = var_2137_cast_fp16)[name = string("q_35_cast_fp16")];
+            string k_13_pad_type_0 = const()[name = string("k_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_13_strides_0 = const()[name = string("k_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = string("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_13_dilations_0 = const()[name = string("k_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_13_groups_0 = const()[name = string("k_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_13 = conv(dilations = k_13_dilations_0, groups = k_13_groups_0, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = k_13_strides_0, weight = layers_2_self_attn_k_proj_weight_palettized, x = var_2057_cast_fp16)[name = string("k_13")];
+            tensor<int32, [4]> var_2155 = const()[name = string("op_2155"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2156 = reshape(shape = var_2155, x = k_13)[name = string("op_2156")];
+            tensor<int32, [4]> transpose_55_perm_0 = const()[name = string("transpose_55_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_5_pad_type_0 = const()[name = string("v_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_5_strides_0 = const()[name = string("v_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_5_pad_0 = const()[name = string("v_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_5_dilations_0 = const()[name = string("v_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_5_groups_0 = const()[name = string("v_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_5 = conv(dilations = v_5_dilations_0, groups = v_5_groups_0, pad = v_5_pad_0, pad_type = v_5_pad_type_0, strides = v_5_strides_0, weight = layers_2_self_attn_v_proj_weight_palettized, x = var_2057_cast_fp16)[name = string("v_5")];
+            tensor<int32, [4]> var_2183 = const()[name = string("op_2183"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2184 = reshape(shape = var_2183, x = v_5)[name = string("op_2184")];
+            tensor<int32, [4]> var_2189 = const()[name = string("op_2189"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2207 = const()[name = string("op_2207"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_55 = transpose(perm = transpose_55_perm_0, x = var_2156)[name = string("transpose_196")];
+            tensor<fp16, [3, 2, 256]> x_43 = reshape(shape = var_2207, x = transpose_55)[name = string("x_43")];
+            int32 var_2213 = const()[name = string("op_2213"), val = int32(-1)];
+            fp16 const_28_promoted = const()[name = string("const_28_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_2215 = mul(x = x_43, y = const_28_promoted)[name = string("op_2215")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_71 = concat(axis = var_2213, interleave = input_71_interleave_0, values = (x_43, var_2215))[name = string("input_71")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2210_to_fp16 = const()[name = string("op_2210_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_2210_to_fp16, x = input_71)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_2220_split_sizes_0 = const()[name = string("op_2220_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2220_axis_0 = const()[name = string("op_2220_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_2220_0, tensor<fp16, [3, 2, 256]> var_2220_1 = split(axis = var_2220_axis_0, split_sizes = var_2220_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_2220")];
+            tensor<fp16, [3, 2, 256]> k_17 = mul(x = var_2220_0, y = layers_2_self_attn_k_norm_weight)[name = string("k_17")];
+            tensor<int32, [4]> var_2227 = const()[name = string("op_2227"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_2228 = reshape(shape = var_2227, x = k_17)[name = string("op_2228")];
+            tensor<int32, [4]> var_2233 = const()[name = string("op_2233"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_2235_promoted = const()[name = string("op_2235_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_2190 = transpose(perm = var_2189, x = var_2184)[name = string("transpose_195")];
+            tensor<fp16, [1, 2, 3, 256]> var_2236 = pow(x = var_2190, y = var_2235_promoted)[name = string("op_2236")];
+            tensor<int32, [1]> var_2241_axes_0 = const()[name = string("op_2241_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2241_keep_dims_0 = const()[name = string("op_2241_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_2241 = reduce_mean(axes = var_2241_axes_0, keep_dims = var_2241_keep_dims_0, x = var_2236)[name = string("op_2241")];
+            fp16 var_2243_to_fp16 = const()[name = string("op_2243_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_5_cast_fp16 = add(x = var_2241, y = var_2243_to_fp16)[name = string("mean_sq_5_cast_fp16")];
+            fp32 var_2245_epsilon_0 = const()[name = string("op_2245_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_2245_cast_fp16 = rsqrt(epsilon = var_2245_epsilon_0, x = mean_sq_5_cast_fp16)[name = string("op_2245_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_75_cast_fp16 = mul(x = var_2190, y = var_2245_cast_fp16)[name = string("input_75_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_33 = transpose(perm = var_2233, x = var_2228)[name = string("transpose_194")];
+            tensor<fp16, [1, 2, 3, 256]> var_2247_cast_fp16 = mul(x = q_33, y = cos_s)[name = string("op_2247_cast_fp16")];
+            tensor<int32, [2]> var_2248_split_sizes_0 = const()[name = string("op_2248_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2248_axis_0 = const()[name = string("op_2248_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_2248_0, tensor<fp16, [1, 2, 3, 128]> var_2248_1 = split(axis = var_2248_axis_0, split_sizes = var_2248_split_sizes_0, x = q_33)[name = string("op_2248")];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_2250 = mul(x = var_2248_1, y = const_29_promoted)[name = string("op_2250")];
+            int32 var_2252 = const()[name = string("op_2252"), val = int32(-1)];
+            bool var_2253_interleave_0 = const()[name = string("op_2253_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_2253 = concat(axis = var_2252, interleave = var_2253_interleave_0, values = (var_2250, var_2248_0))[name = string("op_2253")];
+            tensor<fp16, [1, 2, 3, 256]> var_2254_cast_fp16 = mul(x = var_2253, y = sin_s)[name = string("op_2254_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_73_cast_fp16 = add(x = var_2247_cast_fp16, y = var_2254_cast_fp16)[name = string("input_73_cast_fp16")];
+            tensor<int32, [8]> k_padded_5_pad_0 = const()[name = string("k_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_5_mode_0 = const()[name = string("k_padded_5_mode_0"), val = string("constant")];
+            fp16 const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_5_cast_fp16 = pad(constant_val = const_30_to_fp16, mode = k_padded_5_mode_0, pad = k_padded_5_pad_0, x = input_73_cast_fp16)[name = string("k_padded_5_cast_fp16")];
+            tensor<int32, [8]> v_padded_5_pad_0 = const()[name = string("v_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_5_mode_0 = const()[name = string("v_padded_5_mode_0"), val = string("constant")];
+            fp16 const_31_to_fp16 = const()[name = string("const_31_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_5_cast_fp16 = pad(constant_val = const_31_to_fp16, mode = v_padded_5_mode_0, pad = v_padded_5_pad_0, x = input_75_cast_fp16)[name = string("v_padded_5_cast_fp16")];
+            tensor<int32, [4]> slot_k_5_begin_0 = const()[name = string("slot_k_5_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_5_end_0 = const()[name = string("slot_k_5_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_5_end_mask_0 = const()[name = string("slot_k_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_5_cast_fp16 = slice_by_index(begin = slot_k_5_begin_0, end = slot_k_5_end_0, end_mask = slot_k_5_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("slot_k_5_cast_fp16")];
+            tensor<int32, [4]> slot_v_5_begin_0 = const()[name = string("slot_v_5_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_5_end_0 = const()[name = string("slot_v_5_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_5_end_mask_0 = const()[name = string("slot_v_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_5_cast_fp16 = slice_by_index(begin = slot_v_5_begin_0, end = slot_v_5_end_0, end_mask = slot_v_5_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("slot_v_5_cast_fp16")];
+            tensor<int32, [4]> var_2293_begin_0 = const()[name = string("op_2293_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2293_end_0 = const()[name = string("op_2293_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2293_end_mask_0 = const()[name = string("op_2293_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2293_cast_fp16 = slice_by_index(begin = var_2293_begin_0, end = var_2293_end_0, end_mask = var_2293_end_mask_0, x = slot_k_5_cast_fp16)[name = string("op_2293_cast_fp16")];
+            int32 var_2300 = const()[name = string("op_2300"), val = int32(2)];
+            bool new_k_5_interleave_0 = const()[name = string("new_k_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_5_cast_fp16 = concat(axis = var_2300, interleave = new_k_5_interleave_0, values = (var_2293_cast_fp16, k_padded_5_cast_fp16))[name = string("new_k_5_cast_fp16")];
+            tensor<int32, [4]> var_2316_begin_0 = const()[name = string("op_2316_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2316_end_0 = const()[name = string("op_2316_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2316_end_mask_0 = const()[name = string("op_2316_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2316_cast_fp16 = slice_by_index(begin = var_2316_begin_0, end = var_2316_end_0, end_mask = var_2316_end_mask_0, x = slot_v_5_cast_fp16)[name = string("op_2316_cast_fp16")];
+            int32 var_2323 = const()[name = string("op_2323"), val = int32(2)];
+            bool new_v_5_interleave_0 = const()[name = string("new_v_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_5_cast_fp16 = concat(axis = var_2323, interleave = new_v_5_interleave_0, values = (var_2316_cast_fp16, v_padded_5_cast_fp16))[name = string("new_v_5_cast_fp16")];
+            tensor<int32, [4]> var_2329_begin_0 = const()[name = string("op_2329_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2329_end_0 = const()[name = string("op_2329_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_2329_end_mask_0 = const()[name = string("op_2329_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_2329_cast_fp16 = slice_by_index(begin = var_2329_begin_0, end = var_2329_end_0, end_mask = var_2329_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("op_2329_cast_fp16")];
+            tensor<int32, [4]> var_2334_begin_0 = const()[name = string("op_2334_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2334_end_0 = const()[name = string("op_2334_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2334_end_mask_0 = const()[name = string("op_2334_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_2334_cast_fp16 = slice_by_index(begin = var_2334_begin_0, end = var_2334_end_0, end_mask = var_2334_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("op_2334_cast_fp16")];
+            int32 var_2336 = const()[name = string("op_2336"), val = int32(0)];
+            bool K_sliding_out_5_interleave_0 = const()[name = string("K_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_5_cast_fp16 = concat(axis = var_2336, interleave = K_sliding_out_5_interleave_0, values = (var_2329_cast_fp16, new_k_5_cast_fp16, var_2334_cast_fp16))[name = string("K_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2342_begin_0 = const()[name = string("op_2342_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2342_end_0 = const()[name = string("op_2342_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_2342_end_mask_0 = const()[name = string("op_2342_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_2342_cast_fp16 = slice_by_index(begin = var_2342_begin_0, end = var_2342_end_0, end_mask = var_2342_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("op_2342_cast_fp16")];
+            tensor<int32, [4]> var_2347_begin_0 = const()[name = string("op_2347_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2347_end_0 = const()[name = string("op_2347_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2347_end_mask_0 = const()[name = string("op_2347_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_2347_cast_fp16 = slice_by_index(begin = var_2347_begin_0, end = var_2347_end_0, end_mask = var_2347_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("op_2347_cast_fp16")];
+            int32 var_2349 = const()[name = string("op_2349"), val = int32(0)];
+            bool V_sliding_out_5_interleave_0 = const()[name = string("V_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_5_cast_fp16 = concat(axis = var_2349, interleave = V_sliding_out_5_interleave_0, values = (var_2342_cast_fp16, new_v_5_cast_fp16, var_2347_cast_fp16))[name = string("V_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2355_begin_0 = const()[name = string("op_2355_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_2355_end_0 = const()[name = string("op_2355_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2355_end_mask_0 = const()[name = string("op_2355_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2355_cast_fp16 = slice_by_index(begin = var_2355_begin_0, end = var_2355_end_0, end_mask = var_2355_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2355_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_5_begin_0 = const()[name = string("K_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_5_end_0 = const()[name = string("K_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_5_end_mask_0 = const()[name = string("K_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_5_cast_fp16 = slice_by_index(begin = K_for_attn_5_begin_0, end = K_for_attn_5_end_0, end_mask = K_for_attn_5_end_mask_0, x = var_2355_cast_fp16)[name = string("K_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> var_2365_begin_0 = const()[name = string("op_2365_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_2365_end_0 = const()[name = string("op_2365_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2365_end_mask_0 = const()[name = string("op_2365_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2365_cast_fp16 = slice_by_index(begin = var_2365_begin_0, end = var_2365_end_0, end_mask = var_2365_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2365_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_5_begin_0 = const()[name = string("V_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_5_end_0 = const()[name = string("V_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_5_end_mask_0 = const()[name = string("V_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_5_cast_fp16 = slice_by_index(begin = V_for_attn_5_begin_0, end = V_for_attn_5_end_0, end_mask = V_for_attn_5_end_mask_0, x = var_2365_cast_fp16)[name = string("V_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = K_for_attn_5_cast_fp16)[name = string("transpose_193")];
+            tensor<fp16, [8, 1, 512, 256]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_192")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_56_perm_0 = const()[name = string("transpose_56_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = V_for_attn_5_cast_fp16)[name = string("transpose_191")];
+            tensor<fp16, [8, 1, 512, 256]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_190")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_56_cast_fp16 = transpose(perm = transpose_56_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_189")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_35_cast_fp16, y = transpose_56_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_47_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_47_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_47_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 3, 512]> var_2400 = sub(x = x_47_cast_fp16, y = reduce_max_2)[name = string("op_2400")];
+            tensor<fp16, [1, 8, 3, 512]> var_2406 = exp(x = var_2400)[name = string("op_2406")];
+            tensor<int32, [1]> var_2416_axes_0 = const()[name = string("op_2416_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2416_keep_dims_0 = const()[name = string("op_2416_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2416 = reduce_sum(axes = var_2416_axes_0, keep_dims = var_2416_keep_dims_0, x = var_2406)[name = string("op_2416")];
+            tensor<fp16, [1, 8, 3, 512]> var_2422_cast_fp16 = real_div(x = var_2406, y = var_2416)[name = string("op_2422_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_188")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_2422_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_2433 = const()[name = string("op_2433"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2440 = const()[name = string("op_2440"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2434_cast_fp16 = transpose(perm = var_2433, x = attn_output_13_cast_fp16)[name = string("transpose_187")];
+            tensor<fp16, [1, 3, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_2440, x = var_2434_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_2445 = const()[name = string("op_2445"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2461_pad_type_0 = const()[name = string("op_2461_pad_type_0"), val = string("valid")];
+            int32 var_2461_groups_0 = const()[name = string("op_2461_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2461_strides_0 = const()[name = string("op_2461_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2461_pad_0 = const()[name = string("op_2461_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2461_dilations_0 = const()[name = string("op_2461_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(550980416))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553601920))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2446_cast_fp16 = transpose(perm = var_2445, x = attn_output_15_cast_fp16)[name = string("transpose_186")];
+            tensor<fp16, [1, 2560, 3]> var_2461_cast_fp16 = conv(dilations = var_2461_dilations_0, groups = var_2461_groups_0, pad = var_2461_pad_0, pad_type = var_2461_pad_type_0, strides = var_2461_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_2446_cast_fp16)[name = string("op_2461_cast_fp16")];
+            tensor<int32, [3]> var_2465 = const()[name = string("op_2465"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2471 = const()[name = string("op_2471"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_51_cast_fp16 = transpose(perm = var_2465, x = var_2461_cast_fp16)[name = string("transpose_185")];
+            tensor<fp16, [1, 3, 2560]> var_2473_cast_fp16 = mul(x = x_51_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2473_cast_fp16")];
+            bool input_79_interleave_0 = const()[name = string("input_79_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_79_cast_fp16 = concat(axis = var_2471, interleave = input_79_interleave_0, values = (x_51_cast_fp16, var_2473_cast_fp16))[name = string("input_79_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2468_to_fp16 = const()[name = string("op_2468_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_2468_to_fp16, x = input_79_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_2478_split_sizes_0 = const()[name = string("op_2478_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2478_axis_0 = const()[name = string("op_2478_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2478_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2478_cast_fp16_1 = split(axis = var_2478_axis_0, split_sizes = var_2478_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_2478_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553604544)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_17_cast_fp16 = mul(x = var_2478_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_53_cast_fp16 = add(x = x_39_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_53_cast_fp16")];
+            int32 var_2487 = const()[name = string("op_2487"), val = int32(-1)];
+            fp16 const_33_promoted_to_fp16 = const()[name = string("const_33_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2489_cast_fp16 = mul(x = x_53_cast_fp16, y = const_33_promoted_to_fp16)[name = string("op_2489_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_81_cast_fp16 = concat(axis = var_2487, interleave = input_81_interleave_0, values = (x_53_cast_fp16, var_2489_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2484_to_fp16 = const()[name = string("op_2484_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_2484_to_fp16, x = input_81_cast_fp16)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_2494_split_sizes_0 = const()[name = string("op_2494_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2494_axis_0 = const()[name = string("op_2494_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2494_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2494_cast_fp16_1 = split(axis = var_2494_axis_0, split_sizes = var_2494_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_2494_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553609728)))];
+            tensor<fp16, [1, 3, 2560]> h_15_cast_fp16 = mul(x = var_2494_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_2505 = const()[name = string("op_2505"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_83_axes_0 = const()[name = string("input_83_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2506 = transpose(perm = var_2505, x = h_15_cast_fp16)[name = string("transpose_184")];
+            tensor<fp16, [1, 2560, 1, 3]> input_83 = expand_dims(axes = input_83_axes_0, x = var_2506)[name = string("input_83")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_83)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_83)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 3]> input_85 = mul(x = gate_11, y = up_5)[name = string("input_85")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_85)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_2546_axes_0 = const()[name = string("op_2546_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2546 = squeeze(axes = var_2546_axes_0, x = mlp_out_5)[name = string("op_2546")];
+            tensor<int32, [3]> var_2550 = const()[name = string("op_2550"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2556 = const()[name = string("op_2556"), val = int32(-1)];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_55 = transpose(perm = var_2550, x = var_2546)[name = string("transpose_183")];
+            tensor<fp16, [1, 3, 2560]> var_2558 = mul(x = x_55, y = const_34_promoted)[name = string("op_2558")];
+            bool input_87_interleave_0 = const()[name = string("input_87_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_87 = concat(axis = var_2556, interleave = input_87_interleave_0, values = (x_55, var_2558))[name = string("input_87")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2553_to_fp16 = const()[name = string("op_2553_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_2553_to_fp16, x = input_87)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_2563_split_sizes_0 = const()[name = string("op_2563_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2563_axis_0 = const()[name = string("op_2563_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2563_0, tensor<fp16, [1, 3, 2560]> var_2563_1 = split(axis = var_2563_axis_0, split_sizes = var_2563_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_2563")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_23 = mul(x = var_2563_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_25_cast_fp16 = add(x = x_53_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 512])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 3, 768])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_2591 = const()[name = string("op_2591"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_89_axes_0 = const()[name = string("input_89_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2592 = transpose(perm = var_2591, x = hidden_states_25_cast_fp16)[name = string("transpose_182")];
+            tensor<fp16, [1, 2560, 1, 3]> input_89 = expand_dims(axes = input_89_axes_0, x = var_2592)[name = string("input_89")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_89)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_2611 = const()[name = string("op_2611"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_2612_cast_fp16 = transpose(perm = var_2611, x = per_layer_slice_5_cast_fp16)[name = string("transpose_181")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_2612_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_91_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_91_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553614912))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553942656))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_91_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_2628_axes_0 = const()[name = string("op_2628_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2628_cast_fp16 = squeeze(axes = var_2628_axes_0, x = gated_17_cast_fp16)[name = string("op_2628_cast_fp16")];
+            tensor<int32, [3]> var_2632 = const()[name = string("op_2632"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2638 = const()[name = string("op_2638"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_57_cast_fp16 = transpose(perm = var_2632, x = var_2628_cast_fp16)[name = string("transpose_180")];
+            tensor<fp16, [1, 3, 2560]> var_2640_cast_fp16 = mul(x = x_57_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2640_cast_fp16")];
+            bool input_93_interleave_0 = const()[name = string("input_93_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_93_cast_fp16 = concat(axis = var_2638, interleave = input_93_interleave_0, values = (x_57_cast_fp16, var_2640_cast_fp16))[name = string("input_93_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2635_to_fp16 = const()[name = string("op_2635_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_2635_to_fp16, x = input_93_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_2645_split_sizes_0 = const()[name = string("op_2645_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2645_axis_0 = const()[name = string("op_2645_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2645_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2645_cast_fp16_1 = split(axis = var_2645_axis_0, split_sizes = var_2645_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_2645_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553945280)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_29_cast_fp16 = mul(x = var_2645_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.aep-1])];
+            tensor<fp16, [1, 3, 2560]> x_59_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_36_promoted_to_fp16)[name = string("x_59_cast_fp16")];
+            int32 var_2660 = const()[name = string("op_2660"), val = int32(-1)];
+            fp16 const_37_promoted_to_fp16 = const()[name = string("const_37_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2662_cast_fp16 = mul(x = x_59_cast_fp16, y = const_37_promoted_to_fp16)[name = string("op_2662_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_95_cast_fp16 = concat(axis = var_2660, interleave = input_95_interleave_0, values = (x_59_cast_fp16, var_2662_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2657_to_fp16 = const()[name = string("op_2657_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_2657_to_fp16, x = input_95_cast_fp16)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_2667_split_sizes_0 = const()[name = string("op_2667_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2667_axis_0 = const()[name = string("op_2667_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2667_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2667_cast_fp16_1 = split(axis = var_2667_axis_0, split_sizes = var_2667_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_2667_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553950464)))];
+            tensor<fp16, [1, 3, 2560]> h_19_cast_fp16 = mul(x = var_2667_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_2673 = const()[name = string("op_2673"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2676_axes_0 = const()[name = string("op_2676_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2674_cast_fp16 = transpose(perm = var_2673, x = h_19_cast_fp16)[name = string("transpose_179")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2676_cast_fp16 = expand_dims(axes = var_2676_axes_0, x = var_2674_cast_fp16)[name = string("op_2676_cast_fp16")];
+            string q_37_pad_type_0 = const()[name = string("q_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_37_strides_0 = const()[name = string("q_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_37_pad_0 = const()[name = string("q_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_37_dilations_0 = const()[name = string("q_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_37_groups_0 = const()[name = string("q_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_37 = conv(dilations = q_37_dilations_0, groups = q_37_groups_0, pad = q_37_pad_0, pad_type = q_37_pad_type_0, strides = q_37_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_2676_cast_fp16)[name = string("q_37")];
+            tensor<int32, [4]> var_2697 = const()[name = string("op_2697"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2698 = reshape(shape = var_2697, x = q_37)[name = string("op_2698")];
+            tensor<int32, [4]> transpose_57_perm_0 = const()[name = string("transpose_57_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2721 = const()[name = string("op_2721"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_57 = transpose(perm = transpose_57_perm_0, x = var_2698)[name = string("transpose_178")];
+            tensor<fp16, [3, 8, 256]> x_61 = reshape(shape = var_2721, x = transpose_57)[name = string("x_61")];
+            int32 var_2727 = const()[name = string("op_2727"), val = int32(-1)];
+            fp16 const_38_promoted = const()[name = string("const_38_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2729 = mul(x = x_61, y = const_38_promoted)[name = string("op_2729")];
+            bool input_99_interleave_0 = const()[name = string("input_99_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_99 = concat(axis = var_2727, interleave = input_99_interleave_0, values = (x_61, var_2729))[name = string("input_99")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2724_to_fp16 = const()[name = string("op_2724_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_2724_to_fp16, x = input_99)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_2734_split_sizes_0 = const()[name = string("op_2734_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2734_axis_0 = const()[name = string("op_2734_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2734_0, tensor<fp16, [3, 8, 256]> var_2734_1 = split(axis = var_2734_axis_0, split_sizes = var_2734_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_2734")];
+            tensor<fp16, [3, 8, 256]> q_41 = mul(x = var_2734_0, y = layers_3_self_attn_q_norm_weight)[name = string("q_41")];
+            tensor<int32, [4]> var_2741 = const()[name = string("op_2741"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2742 = reshape(shape = var_2741, x = q_41)[name = string("op_2742")];
+            tensor<int32, [4]> var_2747 = const()[name = string("op_2747"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_43 = transpose(perm = var_2747, x = var_2742)[name = string("transpose_177")];
+            tensor<fp16, [1, 8, 3, 256]> var_2749_cast_fp16 = mul(x = q_43, y = cos_s)[name = string("op_2749_cast_fp16")];
+            tensor<int32, [2]> var_2750_split_sizes_0 = const()[name = string("op_2750_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2750_axis_0 = const()[name = string("op_2750_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2750_0, tensor<fp16, [1, 8, 3, 128]> var_2750_1 = split(axis = var_2750_axis_0, split_sizes = var_2750_split_sizes_0, x = q_43)[name = string("op_2750")];
+            fp16 const_39_promoted = const()[name = string("const_39_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2752 = mul(x = var_2750_1, y = const_39_promoted)[name = string("op_2752")];
+            int32 var_2754 = const()[name = string("op_2754"), val = int32(-1)];
+            bool var_2755_interleave_0 = const()[name = string("op_2755_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2755 = concat(axis = var_2754, interleave = var_2755_interleave_0, values = (var_2752, var_2750_0))[name = string("op_2755")];
+            tensor<fp16, [1, 8, 3, 256]> var_2756_cast_fp16 = mul(x = var_2755, y = sin_s)[name = string("op_2756_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_47_cast_fp16 = add(x = var_2749_cast_fp16, y = var_2756_cast_fp16)[name = string("q_47_cast_fp16")];
+            string k_19_pad_type_0 = const()[name = string("k_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_19_strides_0 = const()[name = string("k_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_19_pad_0 = const()[name = string("k_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_19_dilations_0 = const()[name = string("k_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_19_groups_0 = const()[name = string("k_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_19 = conv(dilations = k_19_dilations_0, groups = k_19_groups_0, pad = k_19_pad_0, pad_type = k_19_pad_type_0, strides = k_19_strides_0, weight = layers_3_self_attn_k_proj_weight_palettized, x = var_2676_cast_fp16)[name = string("k_19")];
+            tensor<int32, [4]> var_2774 = const()[name = string("op_2774"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2775 = reshape(shape = var_2774, x = k_19)[name = string("op_2775")];
+            tensor<int32, [4]> transpose_58_perm_0 = const()[name = string("transpose_58_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_7_pad_type_0 = const()[name = string("v_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_7_strides_0 = const()[name = string("v_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_7_pad_0 = const()[name = string("v_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_7_dilations_0 = const()[name = string("v_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_7_groups_0 = const()[name = string("v_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_7 = conv(dilations = v_7_dilations_0, groups = v_7_groups_0, pad = v_7_pad_0, pad_type = v_7_pad_type_0, strides = v_7_strides_0, weight = layers_3_self_attn_v_proj_weight_palettized, x = var_2676_cast_fp16)[name = string("v_7")];
+            tensor<int32, [4]> var_2802 = const()[name = string("op_2802"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2803 = reshape(shape = var_2802, x = v_7)[name = string("op_2803")];
+            tensor<int32, [4]> var_2808 = const()[name = string("op_2808"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2826 = const()[name = string("op_2826"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_58 = transpose(perm = transpose_58_perm_0, x = var_2775)[name = string("transpose_176")];
+            tensor<fp16, [3, 2, 256]> x_63 = reshape(shape = var_2826, x = transpose_58)[name = string("x_63")];
+            int32 var_2832 = const()[name = string("op_2832"), val = int32(-1)];
+            fp16 const_40_promoted = const()[name = string("const_40_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_2834 = mul(x = x_63, y = const_40_promoted)[name = string("op_2834")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_101 = concat(axis = var_2832, interleave = input_101_interleave_0, values = (x_63, var_2834))[name = string("input_101")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2829_to_fp16 = const()[name = string("op_2829_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_2829_to_fp16, x = input_101)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_2839_split_sizes_0 = const()[name = string("op_2839_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2839_axis_0 = const()[name = string("op_2839_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_2839_0, tensor<fp16, [3, 2, 256]> var_2839_1 = split(axis = var_2839_axis_0, split_sizes = var_2839_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_2839")];
+            tensor<fp16, [3, 2, 256]> k_23 = mul(x = var_2839_0, y = layers_3_self_attn_k_norm_weight)[name = string("k_23")];
+            tensor<int32, [4]> var_2846 = const()[name = string("op_2846"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_2847 = reshape(shape = var_2846, x = k_23)[name = string("op_2847")];
+            tensor<int32, [4]> var_2852 = const()[name = string("op_2852"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_2854_promoted = const()[name = string("op_2854_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_2809 = transpose(perm = var_2808, x = var_2803)[name = string("transpose_175")];
+            tensor<fp16, [1, 2, 3, 256]> var_2855 = pow(x = var_2809, y = var_2854_promoted)[name = string("op_2855")];
+            tensor<int32, [1]> var_2860_axes_0 = const()[name = string("op_2860_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2860_keep_dims_0 = const()[name = string("op_2860_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_2860 = reduce_mean(axes = var_2860_axes_0, keep_dims = var_2860_keep_dims_0, x = var_2855)[name = string("op_2860")];
+            fp16 var_2862_to_fp16 = const()[name = string("op_2862_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_7_cast_fp16 = add(x = var_2860, y = var_2862_to_fp16)[name = string("mean_sq_7_cast_fp16")];
+            fp32 var_2864_epsilon_0 = const()[name = string("op_2864_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_2864_cast_fp16 = rsqrt(epsilon = var_2864_epsilon_0, x = mean_sq_7_cast_fp16)[name = string("op_2864_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_105_cast_fp16 = mul(x = var_2809, y = var_2864_cast_fp16)[name = string("input_105_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_45 = transpose(perm = var_2852, x = var_2847)[name = string("transpose_174")];
+            tensor<fp16, [1, 2, 3, 256]> var_2866_cast_fp16 = mul(x = q_45, y = cos_s)[name = string("op_2866_cast_fp16")];
+            tensor<int32, [2]> var_2867_split_sizes_0 = const()[name = string("op_2867_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2867_axis_0 = const()[name = string("op_2867_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_2867_0, tensor<fp16, [1, 2, 3, 128]> var_2867_1 = split(axis = var_2867_axis_0, split_sizes = var_2867_split_sizes_0, x = q_45)[name = string("op_2867")];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_2869 = mul(x = var_2867_1, y = const_41_promoted)[name = string("op_2869")];
+            int32 var_2871 = const()[name = string("op_2871"), val = int32(-1)];
+            bool var_2872_interleave_0 = const()[name = string("op_2872_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_2872 = concat(axis = var_2871, interleave = var_2872_interleave_0, values = (var_2869, var_2867_0))[name = string("op_2872")];
+            tensor<fp16, [1, 2, 3, 256]> var_2873_cast_fp16 = mul(x = var_2872, y = sin_s)[name = string("op_2873_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_103_cast_fp16 = add(x = var_2866_cast_fp16, y = var_2873_cast_fp16)[name = string("input_103_cast_fp16")];
+            tensor<int32, [8]> k_padded_7_pad_0 = const()[name = string("k_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_7_mode_0 = const()[name = string("k_padded_7_mode_0"), val = string("constant")];
+            fp16 const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_7_cast_fp16 = pad(constant_val = const_42_to_fp16, mode = k_padded_7_mode_0, pad = k_padded_7_pad_0, x = input_103_cast_fp16)[name = string("k_padded_7_cast_fp16")];
+            tensor<int32, [8]> v_padded_7_pad_0 = const()[name = string("v_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_7_mode_0 = const()[name = string("v_padded_7_mode_0"), val = string("constant")];
+            fp16 const_43_to_fp16 = const()[name = string("const_43_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_7_cast_fp16 = pad(constant_val = const_43_to_fp16, mode = v_padded_7_mode_0, pad = v_padded_7_pad_0, x = input_105_cast_fp16)[name = string("v_padded_7_cast_fp16")];
+            tensor<int32, [4]> slot_k_7_begin_0 = const()[name = string("slot_k_7_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_7_end_0 = const()[name = string("slot_k_7_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_7_end_mask_0 = const()[name = string("slot_k_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_7_cast_fp16 = slice_by_index(begin = slot_k_7_begin_0, end = slot_k_7_end_0, end_mask = slot_k_7_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("slot_k_7_cast_fp16")];
+            tensor<int32, [4]> slot_v_7_begin_0 = const()[name = string("slot_v_7_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_7_end_0 = const()[name = string("slot_v_7_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_7_end_mask_0 = const()[name = string("slot_v_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_7_cast_fp16 = slice_by_index(begin = slot_v_7_begin_0, end = slot_v_7_end_0, end_mask = slot_v_7_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("slot_v_7_cast_fp16")];
+            tensor<int32, [4]> var_2912_begin_0 = const()[name = string("op_2912_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2912_end_0 = const()[name = string("op_2912_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2912_end_mask_0 = const()[name = string("op_2912_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2912_cast_fp16 = slice_by_index(begin = var_2912_begin_0, end = var_2912_end_0, end_mask = var_2912_end_mask_0, x = slot_k_7_cast_fp16)[name = string("op_2912_cast_fp16")];
+            int32 var_2919 = const()[name = string("op_2919"), val = int32(2)];
+            bool new_k_7_interleave_0 = const()[name = string("new_k_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_7_cast_fp16 = concat(axis = var_2919, interleave = new_k_7_interleave_0, values = (var_2912_cast_fp16, k_padded_7_cast_fp16))[name = string("new_k_7_cast_fp16")];
+            tensor<int32, [4]> var_2935_begin_0 = const()[name = string("op_2935_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2935_end_0 = const()[name = string("op_2935_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2935_end_mask_0 = const()[name = string("op_2935_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2935_cast_fp16 = slice_by_index(begin = var_2935_begin_0, end = var_2935_end_0, end_mask = var_2935_end_mask_0, x = slot_v_7_cast_fp16)[name = string("op_2935_cast_fp16")];
+            int32 var_2942 = const()[name = string("op_2942"), val = int32(2)];
+            bool new_v_7_interleave_0 = const()[name = string("new_v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_7_cast_fp16 = concat(axis = var_2942, interleave = new_v_7_interleave_0, values = (var_2935_cast_fp16, v_padded_7_cast_fp16))[name = string("new_v_7_cast_fp16")];
+            tensor<int32, [4]> var_2948_begin_0 = const()[name = string("op_2948_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2948_end_0 = const()[name = string("op_2948_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2948_end_mask_0 = const()[name = string("op_2948_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_2948_cast_fp16 = slice_by_index(begin = var_2948_begin_0, end = var_2948_end_0, end_mask = var_2948_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2948_cast_fp16")];
+            tensor<int32, [4]> var_2953_begin_0 = const()[name = string("op_2953_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_2953_end_0 = const()[name = string("op_2953_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2953_end_mask_0 = const()[name = string("op_2953_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_2953_cast_fp16 = slice_by_index(begin = var_2953_begin_0, end = var_2953_end_0, end_mask = var_2953_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2953_cast_fp16")];
+            int32 var_2955 = const()[name = string("op_2955"), val = int32(0)];
+            bool K_sliding_out_7_interleave_0 = const()[name = string("K_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_7_cast_fp16 = concat(axis = var_2955, interleave = K_sliding_out_7_interleave_0, values = (var_2948_cast_fp16, new_k_7_cast_fp16, var_2953_cast_fp16))[name = string("K_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_2961_begin_0 = const()[name = string("op_2961_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2961_end_0 = const()[name = string("op_2961_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2961_end_mask_0 = const()[name = string("op_2961_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_2961_cast_fp16 = slice_by_index(begin = var_2961_begin_0, end = var_2961_end_0, end_mask = var_2961_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2961_cast_fp16")];
+            tensor<int32, [4]> var_2966_begin_0 = const()[name = string("op_2966_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_2966_end_0 = const()[name = string("op_2966_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2966_end_mask_0 = const()[name = string("op_2966_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_2966_cast_fp16 = slice_by_index(begin = var_2966_begin_0, end = var_2966_end_0, end_mask = var_2966_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2966_cast_fp16")];
+            int32 var_2968 = const()[name = string("op_2968"), val = int32(0)];
+            bool V_sliding_out_7_interleave_0 = const()[name = string("V_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_7_cast_fp16 = concat(axis = var_2968, interleave = V_sliding_out_7_interleave_0, values = (var_2961_cast_fp16, new_v_7_cast_fp16, var_2966_cast_fp16))[name = string("V_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_2974_begin_0 = const()[name = string("op_2974_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2974_end_0 = const()[name = string("op_2974_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2974_end_mask_0 = const()[name = string("op_2974_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2974_cast_fp16 = slice_by_index(begin = var_2974_begin_0, end = var_2974_end_0, end_mask = var_2974_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("op_2974_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_7_begin_0 = const()[name = string("K_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_7_end_0 = const()[name = string("K_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_7_end_mask_0 = const()[name = string("K_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_7_cast_fp16 = slice_by_index(begin = K_for_attn_7_begin_0, end = K_for_attn_7_end_0, end_mask = K_for_attn_7_end_mask_0, x = var_2974_cast_fp16)[name = string("K_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> var_2984_begin_0 = const()[name = string("op_2984_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2984_end_0 = const()[name = string("op_2984_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2984_end_mask_0 = const()[name = string("op_2984_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2984_cast_fp16 = slice_by_index(begin = var_2984_begin_0, end = var_2984_end_0, end_mask = var_2984_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("op_2984_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_7_begin_0 = const()[name = string("V_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_7_end_0 = const()[name = string("V_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_7_end_mask_0 = const()[name = string("V_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_7_cast_fp16 = slice_by_index(begin = V_for_attn_7_begin_0, end = V_for_attn_7_end_0, end_mask = V_for_attn_7_end_mask_0, x = var_2984_cast_fp16)[name = string("V_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> transpose_12_perm_0 = const()[name = string("transpose_12_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_6_reps_0 = const()[name = string("tile_6_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_12_cast_fp16 = transpose(perm = transpose_12_perm_0, x = K_for_attn_7_cast_fp16)[name = string("transpose_173")];
+            tensor<fp16, [8, 1, 512, 256]> tile_6_cast_fp16 = tile(reps = tile_6_reps_0, x = transpose_12_cast_fp16)[name = string("tile_6_cast_fp16")];
+            tensor<int32, [5]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_12_cast_fp16 = reshape(shape = concat_12, x = tile_6_cast_fp16)[name = string("reshape_12_cast_fp16")];
+            tensor<int32, [5]> transpose_13_perm_0 = const()[name = string("transpose_13_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_13 = const()[name = string("concat_13"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_13_cast_fp16 = transpose(perm = transpose_13_perm_0, x = reshape_12_cast_fp16)[name = string("transpose_172")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_13_cast_fp16 = reshape(shape = concat_13, x = transpose_13_cast_fp16)[name = string("reshape_13_cast_fp16")];
+            tensor<int32, [4]> transpose_59_perm_0 = const()[name = string("transpose_59_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_14_perm_0 = const()[name = string("transpose_14_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_7_reps_0 = const()[name = string("tile_7_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_14_cast_fp16 = transpose(perm = transpose_14_perm_0, x = V_for_attn_7_cast_fp16)[name = string("transpose_171")];
+            tensor<fp16, [8, 1, 512, 256]> tile_7_cast_fp16 = tile(reps = tile_7_reps_0, x = transpose_14_cast_fp16)[name = string("tile_7_cast_fp16")];
+            tensor<int32, [5]> concat_14 = const()[name = string("concat_14"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_14_cast_fp16 = reshape(shape = concat_14, x = tile_7_cast_fp16)[name = string("reshape_14_cast_fp16")];
+            tensor<int32, [5]> transpose_15_perm_0 = const()[name = string("transpose_15_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_15 = const()[name = string("concat_15"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_15_cast_fp16 = transpose(perm = transpose_15_perm_0, x = reshape_14_cast_fp16)[name = string("transpose_170")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_15_cast_fp16 = reshape(shape = concat_15, x = transpose_15_cast_fp16)[name = string("reshape_15_cast_fp16")];
+            tensor<int32, [4]> V_expanded_7_perm_0 = const()[name = string("V_expanded_7_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_59_cast_fp16 = transpose(perm = transpose_59_perm_0, x = reshape_13_cast_fp16)[name = string("transpose_169")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_47_cast_fp16, y = transpose_59_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_67_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 3, 512]> var_3019 = sub(x = x_67_cast_fp16, y = reduce_max_3)[name = string("op_3019")];
+            tensor<fp16, [1, 8, 3, 512]> var_3025 = exp(x = var_3019)[name = string("op_3025")];
+            tensor<int32, [1]> var_3035_axes_0 = const()[name = string("op_3035_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3035_keep_dims_0 = const()[name = string("op_3035_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3035 = reduce_sum(axes = var_3035_axes_0, keep_dims = var_3035_keep_dims_0, x = var_3025)[name = string("op_3035")];
+            tensor<fp16, [1, 8, 3, 512]> var_3041_cast_fp16 = real_div(x = var_3025, y = var_3035)[name = string("op_3041_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_7_cast_fp16 = transpose(perm = V_expanded_7_perm_0, x = reshape_15_cast_fp16)[name = string("transpose_168")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_3041_cast_fp16, y = V_expanded_7_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_3052 = const()[name = string("op_3052"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3059 = const()[name = string("op_3059"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_3053_cast_fp16 = transpose(perm = var_3052, x = attn_output_19_cast_fp16)[name = string("transpose_167")];
+            tensor<fp16, [1, 3, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_3059, x = var_3053_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_3064 = const()[name = string("op_3064"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3080_pad_type_0 = const()[name = string("op_3080_pad_type_0"), val = string("valid")];
+            int32 var_3080_groups_0 = const()[name = string("op_3080_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3080_strides_0 = const()[name = string("op_3080_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3080_pad_0 = const()[name = string("op_3080_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3080_dilations_0 = const()[name = string("op_3080_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(553955648))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556577152))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3065_cast_fp16 = transpose(perm = var_3064, x = attn_output_21_cast_fp16)[name = string("transpose_166")];
+            tensor<fp16, [1, 2560, 3]> var_3080_cast_fp16 = conv(dilations = var_3080_dilations_0, groups = var_3080_groups_0, pad = var_3080_pad_0, pad_type = var_3080_pad_type_0, strides = var_3080_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_3065_cast_fp16)[name = string("op_3080_cast_fp16")];
+            tensor<int32, [3]> var_3084 = const()[name = string("op_3084"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3090 = const()[name = string("op_3090"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_71_cast_fp16 = transpose(perm = var_3084, x = var_3080_cast_fp16)[name = string("transpose_165")];
+            tensor<fp16, [1, 3, 2560]> var_3092_cast_fp16 = mul(x = x_71_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_3092_cast_fp16")];
+            bool input_109_interleave_0 = const()[name = string("input_109_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_109_cast_fp16 = concat(axis = var_3090, interleave = input_109_interleave_0, values = (x_71_cast_fp16, var_3092_cast_fp16))[name = string("input_109_cast_fp16")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3087_to_fp16 = const()[name = string("op_3087_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_3087_to_fp16, x = input_109_cast_fp16)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_3097_split_sizes_0 = const()[name = string("op_3097_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3097_axis_0 = const()[name = string("op_3097_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3097_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3097_cast_fp16_1 = split(axis = var_3097_axis_0, split_sizes = var_3097_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_3097_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556579776)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_23_cast_fp16 = mul(x = var_3097_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_73_cast_fp16 = add(x = x_59_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_3106 = const()[name = string("op_3106"), val = int32(-1)];
+            fp16 const_45_promoted_to_fp16 = const()[name = string("const_45_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3108_cast_fp16 = mul(x = x_73_cast_fp16, y = const_45_promoted_to_fp16)[name = string("op_3108_cast_fp16")];
+            bool input_111_interleave_0 = const()[name = string("input_111_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_111_cast_fp16 = concat(axis = var_3106, interleave = input_111_interleave_0, values = (x_73_cast_fp16, var_3108_cast_fp16))[name = string("input_111_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3103_to_fp16 = const()[name = string("op_3103_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_3103_to_fp16, x = input_111_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_3113_split_sizes_0 = const()[name = string("op_3113_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3113_axis_0 = const()[name = string("op_3113_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3113_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3113_cast_fp16_1 = split(axis = var_3113_axis_0, split_sizes = var_3113_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_3113_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556584960)))];
+            tensor<fp16, [1, 3, 2560]> h_21_cast_fp16 = mul(x = var_3113_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_3124 = const()[name = string("op_3124"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_113_axes_0 = const()[name = string("input_113_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3125 = transpose(perm = var_3124, x = h_21_cast_fp16)[name = string("transpose_164")];
+            tensor<fp16, [1, 2560, 1, 3]> input_113 = expand_dims(axes = input_113_axes_0, x = var_3125)[name = string("input_113")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_113)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_113)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 3]> input_115 = mul(x = gate_15, y = up_7)[name = string("input_115")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_115)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_3165_axes_0 = const()[name = string("op_3165_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3165 = squeeze(axes = var_3165_axes_0, x = mlp_out_7)[name = string("op_3165")];
+            tensor<int32, [3]> var_3169 = const()[name = string("op_3169"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3175 = const()[name = string("op_3175"), val = int32(-1)];
+            fp16 const_46_promoted = const()[name = string("const_46_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_75 = transpose(perm = var_3169, x = var_3165)[name = string("transpose_163")];
+            tensor<fp16, [1, 3, 2560]> var_3177 = mul(x = x_75, y = const_46_promoted)[name = string("op_3177")];
+            bool input_117_interleave_0 = const()[name = string("input_117_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_117 = concat(axis = var_3175, interleave = input_117_interleave_0, values = (x_75, var_3177))[name = string("input_117")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3172_to_fp16 = const()[name = string("op_3172_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_3172_to_fp16, x = input_117)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_3182_split_sizes_0 = const()[name = string("op_3182_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3182_axis_0 = const()[name = string("op_3182_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3182_0, tensor<fp16, [1, 3, 2560]> var_3182_1 = split(axis = var_3182_axis_0, split_sizes = var_3182_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_3182")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_33 = mul(x = var_3182_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_35_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 768])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 3, 1024])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_3210 = const()[name = string("op_3210"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_119_axes_0 = const()[name = string("input_119_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3211 = transpose(perm = var_3210, x = hidden_states_35_cast_fp16)[name = string("transpose_162")];
+            tensor<fp16, [1, 2560, 1, 3]> input_119 = expand_dims(axes = input_119_axes_0, x = var_3211)[name = string("input_119")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_119)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_3230 = const()[name = string("op_3230"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3231_cast_fp16 = transpose(perm = var_3230, x = per_layer_slice_7_cast_fp16)[name = string("transpose_161")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_3231_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_121_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_121_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556590144))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556917888))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_121_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_3247_axes_0 = const()[name = string("op_3247_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3247_cast_fp16 = squeeze(axes = var_3247_axes_0, x = gated_23_cast_fp16)[name = string("op_3247_cast_fp16")];
+            tensor<int32, [3]> var_3251 = const()[name = string("op_3251"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3257 = const()[name = string("op_3257"), val = int32(-1)];
+            fp16 const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_77_cast_fp16 = transpose(perm = var_3251, x = var_3247_cast_fp16)[name = string("transpose_160")];
+            tensor<fp16, [1, 3, 2560]> var_3259_cast_fp16 = mul(x = x_77_cast_fp16, y = const_47_promoted_to_fp16)[name = string("op_3259_cast_fp16")];
+            bool input_123_interleave_0 = const()[name = string("input_123_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_123_cast_fp16 = concat(axis = var_3257, interleave = input_123_interleave_0, values = (x_77_cast_fp16, var_3259_cast_fp16))[name = string("input_123_cast_fp16")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3254_to_fp16 = const()[name = string("op_3254_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_3254_to_fp16, x = input_123_cast_fp16)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_3264_split_sizes_0 = const()[name = string("op_3264_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3264_axis_0 = const()[name = string("op_3264_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3264_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3264_cast_fp16_1 = split(axis = var_3264_axis_0, split_sizes = var_3264_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_3264_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556920512)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_39_cast_fp16 = mul(x = var_3264_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 3, 2560]> x_79_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_48_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_3279 = const()[name = string("op_3279"), val = int32(-1)];
+            fp16 const_49_promoted_to_fp16 = const()[name = string("const_49_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3281_cast_fp16 = mul(x = x_79_cast_fp16, y = const_49_promoted_to_fp16)[name = string("op_3281_cast_fp16")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_125_cast_fp16 = concat(axis = var_3279, interleave = input_125_interleave_0, values = (x_79_cast_fp16, var_3281_cast_fp16))[name = string("input_125_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3276_to_fp16 = const()[name = string("op_3276_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_3276_to_fp16, x = input_125_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_3286_split_sizes_0 = const()[name = string("op_3286_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3286_axis_0 = const()[name = string("op_3286_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3286_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3286_cast_fp16_1 = split(axis = var_3286_axis_0, split_sizes = var_3286_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_3286_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556925696)))];
+            tensor<fp16, [1, 3, 2560]> h_25_cast_fp16 = mul(x = var_3286_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_3292 = const()[name = string("op_3292"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3295_axes_0 = const()[name = string("op_3295_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3293_cast_fp16 = transpose(perm = var_3292, x = h_25_cast_fp16)[name = string("transpose_159")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3295_cast_fp16 = expand_dims(axes = var_3295_axes_0, x = var_3293_cast_fp16)[name = string("op_3295_cast_fp16")];
+            string q_49_pad_type_0 = const()[name = string("q_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_49_strides_0 = const()[name = string("q_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_49_pad_0 = const()[name = string("q_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_49_dilations_0 = const()[name = string("q_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_49_groups_0 = const()[name = string("q_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_49 = conv(dilations = q_49_dilations_0, groups = q_49_groups_0, pad = q_49_pad_0, pad_type = q_49_pad_type_0, strides = q_49_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_3295_cast_fp16)[name = string("q_49")];
+            tensor<int32, [4]> var_3316 = const()[name = string("op_3316"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_3317 = reshape(shape = var_3316, x = q_49)[name = string("op_3317")];
+            tensor<int32, [4]> transpose_60_perm_0 = const()[name = string("transpose_60_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3340 = const()[name = string("op_3340"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_60 = transpose(perm = transpose_60_perm_0, x = var_3317)[name = string("transpose_158")];
+            tensor<fp16, [3, 8, 256]> x_81 = reshape(shape = var_3340, x = transpose_60)[name = string("x_81")];
+            int32 var_3346 = const()[name = string("op_3346"), val = int32(-1)];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_3348 = mul(x = x_81, y = const_50_promoted)[name = string("op_3348")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_129 = concat(axis = var_3346, interleave = input_129_interleave_0, values = (x_81, var_3348))[name = string("input_129")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3343_to_fp16 = const()[name = string("op_3343_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_3343_to_fp16, x = input_129)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_3353_split_sizes_0 = const()[name = string("op_3353_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3353_axis_0 = const()[name = string("op_3353_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_3353_0, tensor<fp16, [3, 8, 256]> var_3353_1 = split(axis = var_3353_axis_0, split_sizes = var_3353_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_3353")];
+            tensor<fp16, [3, 8, 256]> q_53 = mul(x = var_3353_0, y = layers_4_self_attn_q_norm_weight)[name = string("q_53")];
+            tensor<int32, [4]> var_3360 = const()[name = string("op_3360"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_3361 = reshape(shape = var_3360, x = q_53)[name = string("op_3361")];
+            tensor<int32, [4]> var_3366 = const()[name = string("op_3366"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_55 = transpose(perm = var_3366, x = var_3361)[name = string("transpose_157")];
+            tensor<fp16, [1, 8, 3, 256]> var_3368_cast_fp16 = mul(x = q_55, y = cos_s)[name = string("op_3368_cast_fp16")];
+            tensor<int32, [2]> var_3369_split_sizes_0 = const()[name = string("op_3369_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3369_axis_0 = const()[name = string("op_3369_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_3369_0, tensor<fp16, [1, 8, 3, 128]> var_3369_1 = split(axis = var_3369_axis_0, split_sizes = var_3369_split_sizes_0, x = q_55)[name = string("op_3369")];
+            fp16 const_51_promoted = const()[name = string("const_51_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_3371 = mul(x = var_3369_1, y = const_51_promoted)[name = string("op_3371")];
+            int32 var_3373 = const()[name = string("op_3373"), val = int32(-1)];
+            bool var_3374_interleave_0 = const()[name = string("op_3374_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_3374 = concat(axis = var_3373, interleave = var_3374_interleave_0, values = (var_3371, var_3369_0))[name = string("op_3374")];
+            tensor<fp16, [1, 8, 3, 256]> var_3375_cast_fp16 = mul(x = var_3374, y = sin_s)[name = string("op_3375_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_59_cast_fp16 = add(x = var_3368_cast_fp16, y = var_3375_cast_fp16)[name = string("q_59_cast_fp16")];
+            string k_25_pad_type_0 = const()[name = string("k_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_25_strides_0 = const()[name = string("k_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_25_pad_0 = const()[name = string("k_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_25_dilations_0 = const()[name = string("k_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_25_groups_0 = const()[name = string("k_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_25 = conv(dilations = k_25_dilations_0, groups = k_25_groups_0, pad = k_25_pad_0, pad_type = k_25_pad_type_0, strides = k_25_strides_0, weight = layers_4_self_attn_k_proj_weight_palettized, x = var_3295_cast_fp16)[name = string("k_25")];
+            tensor<int32, [4]> var_3393 = const()[name = string("op_3393"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_3394 = reshape(shape = var_3393, x = k_25)[name = string("op_3394")];
+            tensor<int32, [4]> transpose_61_perm_0 = const()[name = string("transpose_61_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_9_pad_type_0 = const()[name = string("v_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_9_strides_0 = const()[name = string("v_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_9_pad_0 = const()[name = string("v_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_9_dilations_0 = const()[name = string("v_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_9_groups_0 = const()[name = string("v_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_9 = conv(dilations = v_9_dilations_0, groups = v_9_groups_0, pad = v_9_pad_0, pad_type = v_9_pad_type_0, strides = v_9_strides_0, weight = layers_4_self_attn_v_proj_weight_palettized, x = var_3295_cast_fp16)[name = string("v_9")];
+            tensor<int32, [4]> var_3421 = const()[name = string("op_3421"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_3422 = reshape(shape = var_3421, x = v_9)[name = string("op_3422")];
+            tensor<int32, [4]> var_3427 = const()[name = string("op_3427"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3445 = const()[name = string("op_3445"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_61 = transpose(perm = transpose_61_perm_0, x = var_3394)[name = string("transpose_156")];
+            tensor<fp16, [3, 2, 256]> x_83 = reshape(shape = var_3445, x = transpose_61)[name = string("x_83")];
+            int32 var_3451 = const()[name = string("op_3451"), val = int32(-1)];
+            fp16 const_52_promoted = const()[name = string("const_52_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_3453 = mul(x = x_83, y = const_52_promoted)[name = string("op_3453")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_131 = concat(axis = var_3451, interleave = input_131_interleave_0, values = (x_83, var_3453))[name = string("input_131")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3448_to_fp16 = const()[name = string("op_3448_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_3448_to_fp16, x = input_131)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_3458_split_sizes_0 = const()[name = string("op_3458_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3458_axis_0 = const()[name = string("op_3458_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_3458_0, tensor<fp16, [3, 2, 256]> var_3458_1 = split(axis = var_3458_axis_0, split_sizes = var_3458_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_3458")];
+            tensor<fp16, [3, 2, 256]> k_29 = mul(x = var_3458_0, y = layers_4_self_attn_k_norm_weight)[name = string("k_29")];
+            tensor<int32, [4]> var_3465 = const()[name = string("op_3465"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_3466 = reshape(shape = var_3465, x = k_29)[name = string("op_3466")];
+            tensor<int32, [4]> var_3471 = const()[name = string("op_3471"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_3473_promoted = const()[name = string("op_3473_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_3428 = transpose(perm = var_3427, x = var_3422)[name = string("transpose_155")];
+            tensor<fp16, [1, 2, 3, 256]> var_3474 = pow(x = var_3428, y = var_3473_promoted)[name = string("op_3474")];
+            tensor<int32, [1]> var_3479_axes_0 = const()[name = string("op_3479_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3479_keep_dims_0 = const()[name = string("op_3479_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_3479 = reduce_mean(axes = var_3479_axes_0, keep_dims = var_3479_keep_dims_0, x = var_3474)[name = string("op_3479")];
+            fp16 var_3481_to_fp16 = const()[name = string("op_3481_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_9_cast_fp16 = add(x = var_3479, y = var_3481_to_fp16)[name = string("mean_sq_9_cast_fp16")];
+            fp32 var_3483_epsilon_0 = const()[name = string("op_3483_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_3483_cast_fp16 = rsqrt(epsilon = var_3483_epsilon_0, x = mean_sq_9_cast_fp16)[name = string("op_3483_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_135_cast_fp16 = mul(x = var_3428, y = var_3483_cast_fp16)[name = string("input_135_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_57 = transpose(perm = var_3471, x = var_3466)[name = string("transpose_154")];
+            tensor<fp16, [1, 2, 3, 256]> var_3485_cast_fp16 = mul(x = q_57, y = cos_s)[name = string("op_3485_cast_fp16")];
+            tensor<int32, [2]> var_3486_split_sizes_0 = const()[name = string("op_3486_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3486_axis_0 = const()[name = string("op_3486_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_3486_0, tensor<fp16, [1, 2, 3, 128]> var_3486_1 = split(axis = var_3486_axis_0, split_sizes = var_3486_split_sizes_0, x = q_57)[name = string("op_3486")];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_3488 = mul(x = var_3486_1, y = const_53_promoted)[name = string("op_3488")];
+            int32 var_3490 = const()[name = string("op_3490"), val = int32(-1)];
+            bool var_3491_interleave_0 = const()[name = string("op_3491_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_3491 = concat(axis = var_3490, interleave = var_3491_interleave_0, values = (var_3488, var_3486_0))[name = string("op_3491")];
+            tensor<fp16, [1, 2, 3, 256]> var_3492_cast_fp16 = mul(x = var_3491, y = sin_s)[name = string("op_3492_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_133_cast_fp16 = add(x = var_3485_cast_fp16, y = var_3492_cast_fp16)[name = string("input_133_cast_fp16")];
+            tensor<int32, [8]> k_padded_9_pad_0 = const()[name = string("k_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_9_mode_0 = const()[name = string("k_padded_9_mode_0"), val = string("constant")];
+            fp16 const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_9_cast_fp16 = pad(constant_val = const_54_to_fp16, mode = k_padded_9_mode_0, pad = k_padded_9_pad_0, x = input_133_cast_fp16)[name = string("k_padded_9_cast_fp16")];
+            tensor<int32, [8]> v_padded_9_pad_0 = const()[name = string("v_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_9_mode_0 = const()[name = string("v_padded_9_mode_0"), val = string("constant")];
+            fp16 const_55_to_fp16 = const()[name = string("const_55_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_9_cast_fp16 = pad(constant_val = const_55_to_fp16, mode = v_padded_9_mode_0, pad = v_padded_9_pad_0, x = input_135_cast_fp16)[name = string("v_padded_9_cast_fp16")];
+            tensor<int32, [4]> slot_k_9_begin_0 = const()[name = string("slot_k_9_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_9_end_0 = const()[name = string("slot_k_9_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_9_end_mask_0 = const()[name = string("slot_k_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_9_cast_fp16 = slice_by_index(begin = slot_k_9_begin_0, end = slot_k_9_end_0, end_mask = slot_k_9_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("slot_k_9_cast_fp16")];
+            tensor<int32, [4]> slot_v_9_begin_0 = const()[name = string("slot_v_9_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_9_end_0 = const()[name = string("slot_v_9_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_9_end_mask_0 = const()[name = string("slot_v_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_9_cast_fp16 = slice_by_index(begin = slot_v_9_begin_0, end = slot_v_9_end_0, end_mask = slot_v_9_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("slot_v_9_cast_fp16")];
+            tensor<int32, [4]> var_3531_begin_0 = const()[name = string("op_3531_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_3531_end_0 = const()[name = string("op_3531_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3531_end_mask_0 = const()[name = string("op_3531_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_3531_cast_fp16 = slice_by_index(begin = var_3531_begin_0, end = var_3531_end_0, end_mask = var_3531_end_mask_0, x = slot_k_9_cast_fp16)[name = string("op_3531_cast_fp16")];
+            int32 var_3538 = const()[name = string("op_3538"), val = int32(2)];
+            bool new_k_9_interleave_0 = const()[name = string("new_k_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_9_cast_fp16 = concat(axis = var_3538, interleave = new_k_9_interleave_0, values = (var_3531_cast_fp16, k_padded_9_cast_fp16))[name = string("new_k_9_cast_fp16")];
+            tensor<int32, [4]> var_3554_begin_0 = const()[name = string("op_3554_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_3554_end_0 = const()[name = string("op_3554_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3554_end_mask_0 = const()[name = string("op_3554_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_3554_cast_fp16 = slice_by_index(begin = var_3554_begin_0, end = var_3554_end_0, end_mask = var_3554_end_mask_0, x = slot_v_9_cast_fp16)[name = string("op_3554_cast_fp16")];
+            int32 var_3561 = const()[name = string("op_3561"), val = int32(2)];
+            bool new_v_9_interleave_0 = const()[name = string("new_v_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_9_cast_fp16 = concat(axis = var_3561, interleave = new_v_9_interleave_0, values = (var_3554_cast_fp16, v_padded_9_cast_fp16))[name = string("new_v_9_cast_fp16")];
+            tensor<int32, [4]> var_3567_begin_0 = const()[name = string("op_3567_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3567_end_0 = const()[name = string("op_3567_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_3567_end_mask_0 = const()[name = string("op_3567_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_3567_cast_fp16 = slice_by_index(begin = var_3567_begin_0, end = var_3567_end_0, end_mask = var_3567_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("op_3567_cast_fp16")];
+            tensor<int32, [4]> var_3572_begin_0 = const()[name = string("op_3572_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_3572_end_0 = const()[name = string("op_3572_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_3572_end_mask_0 = const()[name = string("op_3572_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_3572_cast_fp16 = slice_by_index(begin = var_3572_begin_0, end = var_3572_end_0, end_mask = var_3572_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("op_3572_cast_fp16")];
+            int32 var_3574 = const()[name = string("op_3574"), val = int32(0)];
+            bool K_sliding_out_9_interleave_0 = const()[name = string("K_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_9_cast_fp16 = concat(axis = var_3574, interleave = K_sliding_out_9_interleave_0, values = (var_3567_cast_fp16, new_k_9_cast_fp16, var_3572_cast_fp16))[name = string("K_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3580_begin_0 = const()[name = string("op_3580_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3580_end_0 = const()[name = string("op_3580_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_3580_end_mask_0 = const()[name = string("op_3580_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_3580_cast_fp16 = slice_by_index(begin = var_3580_begin_0, end = var_3580_end_0, end_mask = var_3580_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("op_3580_cast_fp16")];
+            tensor<int32, [4]> var_3585_begin_0 = const()[name = string("op_3585_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_3585_end_0 = const()[name = string("op_3585_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_3585_end_mask_0 = const()[name = string("op_3585_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_3585_cast_fp16 = slice_by_index(begin = var_3585_begin_0, end = var_3585_end_0, end_mask = var_3585_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("op_3585_cast_fp16")];
+            int32 var_3587 = const()[name = string("op_3587"), val = int32(0)];
+            bool V_sliding_out_9_interleave_0 = const()[name = string("V_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_9_cast_fp16 = concat(axis = var_3587, interleave = V_sliding_out_9_interleave_0, values = (var_3580_cast_fp16, new_v_9_cast_fp16, var_3585_cast_fp16))[name = string("V_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3593_begin_0 = const()[name = string("op_3593_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3593_end_0 = const()[name = string("op_3593_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3593_end_mask_0 = const()[name = string("op_3593_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_3593_cast_fp16 = slice_by_index(begin = var_3593_begin_0, end = var_3593_end_0, end_mask = var_3593_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("op_3593_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_9_begin_0 = const()[name = string("K_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_9_end_0 = const()[name = string("K_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_9_end_mask_0 = const()[name = string("K_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_9_cast_fp16 = slice_by_index(begin = K_for_attn_9_begin_0, end = K_for_attn_9_end_0, end_mask = K_for_attn_9_end_mask_0, x = var_3593_cast_fp16)[name = string("K_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> var_3603_begin_0 = const()[name = string("op_3603_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3603_end_0 = const()[name = string("op_3603_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3603_end_mask_0 = const()[name = string("op_3603_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_3603_cast_fp16 = slice_by_index(begin = var_3603_begin_0, end = var_3603_end_0, end_mask = var_3603_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("op_3603_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_9_begin_0 = const()[name = string("V_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_9_end_0 = const()[name = string("V_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_9_end_mask_0 = const()[name = string("V_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_9_cast_fp16 = slice_by_index(begin = V_for_attn_9_begin_0, end = V_for_attn_9_end_0, end_mask = V_for_attn_9_end_mask_0, x = var_3603_cast_fp16)[name = string("V_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> transpose_16_perm_0 = const()[name = string("transpose_16_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_8_reps_0 = const()[name = string("tile_8_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_16_cast_fp16 = transpose(perm = transpose_16_perm_0, x = K_for_attn_9_cast_fp16)[name = string("transpose_153")];
+            tensor<fp16, [8, 1, 512, 256]> tile_8_cast_fp16 = tile(reps = tile_8_reps_0, x = transpose_16_cast_fp16)[name = string("tile_8_cast_fp16")];
+            tensor<int32, [5]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_16_cast_fp16 = reshape(shape = concat_16, x = tile_8_cast_fp16)[name = string("reshape_16_cast_fp16")];
+            tensor<int32, [5]> transpose_17_perm_0 = const()[name = string("transpose_17_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_17 = const()[name = string("concat_17"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_17_cast_fp16 = transpose(perm = transpose_17_perm_0, x = reshape_16_cast_fp16)[name = string("transpose_152")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_17_cast_fp16 = reshape(shape = concat_17, x = transpose_17_cast_fp16)[name = string("reshape_17_cast_fp16")];
+            tensor<int32, [4]> transpose_62_perm_0 = const()[name = string("transpose_62_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_18_perm_0 = const()[name = string("transpose_18_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_9_reps_0 = const()[name = string("tile_9_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_18_cast_fp16 = transpose(perm = transpose_18_perm_0, x = V_for_attn_9_cast_fp16)[name = string("transpose_151")];
+            tensor<fp16, [8, 1, 512, 256]> tile_9_cast_fp16 = tile(reps = tile_9_reps_0, x = transpose_18_cast_fp16)[name = string("tile_9_cast_fp16")];
+            tensor<int32, [5]> concat_18 = const()[name = string("concat_18"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_18_cast_fp16 = reshape(shape = concat_18, x = tile_9_cast_fp16)[name = string("reshape_18_cast_fp16")];
+            tensor<int32, [5]> transpose_19_perm_0 = const()[name = string("transpose_19_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_19 = const()[name = string("concat_19"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_19_cast_fp16 = transpose(perm = transpose_19_perm_0, x = reshape_18_cast_fp16)[name = string("transpose_150")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_19_cast_fp16 = reshape(shape = concat_19, x = transpose_19_cast_fp16)[name = string("reshape_19_cast_fp16")];
+            tensor<int32, [4]> V_expanded_9_perm_0 = const()[name = string("V_expanded_9_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_62_cast_fp16 = transpose(perm = transpose_62_perm_0, x = reshape_17_cast_fp16)[name = string("transpose_149")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_59_cast_fp16, y = transpose_62_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_87_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_87_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_87_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 3, 512]> var_3638 = sub(x = x_87_cast_fp16, y = reduce_max_4)[name = string("op_3638")];
+            tensor<fp16, [1, 8, 3, 512]> var_3644 = exp(x = var_3638)[name = string("op_3644")];
+            tensor<int32, [1]> var_3654_axes_0 = const()[name = string("op_3654_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3654_keep_dims_0 = const()[name = string("op_3654_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3654 = reduce_sum(axes = var_3654_axes_0, keep_dims = var_3654_keep_dims_0, x = var_3644)[name = string("op_3654")];
+            tensor<fp16, [1, 8, 3, 512]> var_3660_cast_fp16 = real_div(x = var_3644, y = var_3654)[name = string("op_3660_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_9_cast_fp16 = transpose(perm = V_expanded_9_perm_0, x = reshape_19_cast_fp16)[name = string("transpose_148")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_3660_cast_fp16, y = V_expanded_9_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_3671 = const()[name = string("op_3671"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3678 = const()[name = string("op_3678"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_3672_cast_fp16 = transpose(perm = var_3671, x = attn_output_25_cast_fp16)[name = string("transpose_147")];
+            tensor<fp16, [1, 3, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_3678, x = var_3672_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_3683 = const()[name = string("op_3683"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3699_pad_type_0 = const()[name = string("op_3699_pad_type_0"), val = string("valid")];
+            int32 var_3699_groups_0 = const()[name = string("op_3699_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3699_strides_0 = const()[name = string("op_3699_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3699_pad_0 = const()[name = string("op_3699_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3699_dilations_0 = const()[name = string("op_3699_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(556930880))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559552384))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3684_cast_fp16 = transpose(perm = var_3683, x = attn_output_27_cast_fp16)[name = string("transpose_146")];
+            tensor<fp16, [1, 2560, 3]> var_3699_cast_fp16 = conv(dilations = var_3699_dilations_0, groups = var_3699_groups_0, pad = var_3699_pad_0, pad_type = var_3699_pad_type_0, strides = var_3699_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_3684_cast_fp16)[name = string("op_3699_cast_fp16")];
+            tensor<int32, [3]> var_3703 = const()[name = string("op_3703"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3709 = const()[name = string("op_3709"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_91_cast_fp16 = transpose(perm = var_3703, x = var_3699_cast_fp16)[name = string("transpose_145")];
+            tensor<fp16, [1, 3, 2560]> var_3711_cast_fp16 = mul(x = x_91_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3711_cast_fp16")];
+            bool input_139_interleave_0 = const()[name = string("input_139_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_139_cast_fp16 = concat(axis = var_3709, interleave = input_139_interleave_0, values = (x_91_cast_fp16, var_3711_cast_fp16))[name = string("input_139_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3706_to_fp16 = const()[name = string("op_3706_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_3706_to_fp16, x = input_139_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_3716_split_sizes_0 = const()[name = string("op_3716_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3716_axis_0 = const()[name = string("op_3716_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3716_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3716_cast_fp16_1 = split(axis = var_3716_axis_0, split_sizes = var_3716_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_3716_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559555008)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_29_cast_fp16 = mul(x = var_3716_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_93_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_93_cast_fp16")];
+            int32 var_3725 = const()[name = string("op_3725"), val = int32(-1)];
+            fp16 const_57_promoted_to_fp16 = const()[name = string("const_57_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3727_cast_fp16 = mul(x = x_93_cast_fp16, y = const_57_promoted_to_fp16)[name = string("op_3727_cast_fp16")];
+            bool input_141_interleave_0 = const()[name = string("input_141_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_141_cast_fp16 = concat(axis = var_3725, interleave = input_141_interleave_0, values = (x_93_cast_fp16, var_3727_cast_fp16))[name = string("input_141_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3722_to_fp16 = const()[name = string("op_3722_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_3722_to_fp16, x = input_141_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_3732_split_sizes_0 = const()[name = string("op_3732_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3732_axis_0 = const()[name = string("op_3732_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3732_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3732_cast_fp16_1 = split(axis = var_3732_axis_0, split_sizes = var_3732_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_3732_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559560192)))];
+            tensor<fp16, [1, 3, 2560]> h_27_cast_fp16 = mul(x = var_3732_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_3743 = const()[name = string("op_3743"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_143_axes_0 = const()[name = string("input_143_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3744 = transpose(perm = var_3743, x = h_27_cast_fp16)[name = string("transpose_144")];
+            tensor<fp16, [1, 2560, 1, 3]> input_143 = expand_dims(axes = input_143_axes_0, x = var_3744)[name = string("input_143")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_143)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_143)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 3]> input_145 = mul(x = gate_19, y = up_9)[name = string("input_145")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_145)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_3784_axes_0 = const()[name = string("op_3784_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3784 = squeeze(axes = var_3784_axes_0, x = mlp_out_9)[name = string("op_3784")];
+            tensor<int32, [3]> var_3788 = const()[name = string("op_3788"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3794 = const()[name = string("op_3794"), val = int32(-1)];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_95 = transpose(perm = var_3788, x = var_3784)[name = string("transpose_143")];
+            tensor<fp16, [1, 3, 2560]> var_3796 = mul(x = x_95, y = const_58_promoted)[name = string("op_3796")];
+            bool input_147_interleave_0 = const()[name = string("input_147_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_147 = concat(axis = var_3794, interleave = input_147_interleave_0, values = (x_95, var_3796))[name = string("input_147")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3791_to_fp16 = const()[name = string("op_3791_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_3791_to_fp16, x = input_147)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_3801_split_sizes_0 = const()[name = string("op_3801_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3801_axis_0 = const()[name = string("op_3801_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3801_0, tensor<fp16, [1, 3, 2560]> var_3801_1 = split(axis = var_3801_axis_0, split_sizes = var_3801_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_3801")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_43 = mul(x = var_3801_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_45_cast_fp16 = add(x = x_93_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 1024])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 3, 1280])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_3829 = const()[name = string("op_3829"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_149_axes_0 = const()[name = string("input_149_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3830 = transpose(perm = var_3829, x = hidden_states_45_cast_fp16)[name = string("transpose_142")];
+            tensor<fp16, [1, 2560, 1, 3]> input_149 = expand_dims(axes = input_149_axes_0, x = var_3830)[name = string("input_149")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_149)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_3849 = const()[name = string("op_3849"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3850_cast_fp16 = transpose(perm = var_3849, x = per_layer_slice_9_cast_fp16)[name = string("transpose_141")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_3850_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_151_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_151_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559565376))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559893120))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_151_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_3866_axes_0 = const()[name = string("op_3866_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3866_cast_fp16 = squeeze(axes = var_3866_axes_0, x = gated_29_cast_fp16)[name = string("op_3866_cast_fp16")];
+            tensor<int32, [3]> var_3870 = const()[name = string("op_3870"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3876 = const()[name = string("op_3876"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_97_cast_fp16 = transpose(perm = var_3870, x = var_3866_cast_fp16)[name = string("transpose_140")];
+            tensor<fp16, [1, 3, 2560]> var_3878_cast_fp16 = mul(x = x_97_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3878_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_153_cast_fp16 = concat(axis = var_3876, interleave = input_153_interleave_0, values = (x_97_cast_fp16, var_3878_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3873_to_fp16 = const()[name = string("op_3873_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_3873_to_fp16, x = input_153_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_3883_split_sizes_0 = const()[name = string("op_3883_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3883_axis_0 = const()[name = string("op_3883_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3883_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3883_cast_fp16_1 = split(axis = var_3883_axis_0, split_sizes = var_3883_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_3883_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559895744)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_49_cast_fp16 = mul(x = var_3883_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.2cp-1])];
+            tensor<fp16, [1, 3, 2560]> x_99_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_60_promoted_to_fp16)[name = string("x_99_cast_fp16")];
+            int32 var_3898 = const()[name = string("op_3898"), val = int32(-1)];
+            fp16 const_61_promoted_to_fp16 = const()[name = string("const_61_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3900_cast_fp16 = mul(x = x_99_cast_fp16, y = const_61_promoted_to_fp16)[name = string("op_3900_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_155_cast_fp16 = concat(axis = var_3898, interleave = input_155_interleave_0, values = (x_99_cast_fp16, var_3900_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3895_to_fp16 = const()[name = string("op_3895_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_3895_to_fp16, x = input_155_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_3905_split_sizes_0 = const()[name = string("op_3905_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3905_axis_0 = const()[name = string("op_3905_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3905_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3905_cast_fp16_1 = split(axis = var_3905_axis_0, split_sizes = var_3905_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_3905_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559900928)))];
+            tensor<fp16, [1, 3, 2560]> h_31_cast_fp16 = mul(x = var_3905_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_3911 = const()[name = string("op_3911"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3914_axes_0 = const()[name = string("op_3914_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3912_cast_fp16 = transpose(perm = var_3911, x = h_31_cast_fp16)[name = string("transpose_139")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3914_cast_fp16 = expand_dims(axes = var_3914_axes_0, x = var_3912_cast_fp16)[name = string("op_3914_cast_fp16")];
+            string q_61_pad_type_0 = const()[name = string("q_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_61_strides_0 = const()[name = string("q_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_61_pad_0 = const()[name = string("q_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_61_dilations_0 = const()[name = string("q_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_61_groups_0 = const()[name = string("q_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_61 = conv(dilations = q_61_dilations_0, groups = q_61_groups_0, pad = q_61_pad_0, pad_type = q_61_pad_type_0, strides = q_61_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_3914_cast_fp16)[name = string("q_61")];
+            tensor<int32, [4]> var_3935 = const()[name = string("op_3935"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_3936 = reshape(shape = var_3935, x = q_61)[name = string("op_3936")];
+            tensor<int32, [4]> transpose_63_perm_0 = const()[name = string("transpose_63_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3959 = const()[name = string("op_3959"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_63 = transpose(perm = transpose_63_perm_0, x = var_3936)[name = string("transpose_138")];
+            tensor<fp16, [3, 8, 512]> x_101 = reshape(shape = var_3959, x = transpose_63)[name = string("x_101")];
+            int32 var_3965 = const()[name = string("op_3965"), val = int32(-1)];
+            fp16 const_62_promoted = const()[name = string("const_62_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_3967 = mul(x = x_101, y = const_62_promoted)[name = string("op_3967")];
+            bool input_159_interleave_0 = const()[name = string("input_159_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_159 = concat(axis = var_3965, interleave = input_159_interleave_0, values = (x_101, var_3967))[name = string("input_159")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3962_to_fp16 = const()[name = string("op_3962_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_3962_to_fp16, x = input_159)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_3972_split_sizes_0 = const()[name = string("op_3972_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3972_axis_0 = const()[name = string("op_3972_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_3972_0, tensor<fp16, [3, 8, 512]> var_3972_1 = split(axis = var_3972_axis_0, split_sizes = var_3972_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_3972")];
+            tensor<fp16, [3, 8, 512]> q_65 = mul(x = var_3972_0, y = layers_5_self_attn_q_norm_weight)[name = string("q_65")];
+            tensor<int32, [4]> var_3979 = const()[name = string("op_3979"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_3980 = reshape(shape = var_3979, x = q_65)[name = string("op_3980")];
+            tensor<int32, [4]> var_3985 = const()[name = string("op_3985"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_67 = transpose(perm = var_3985, x = var_3980)[name = string("transpose_137")];
+            tensor<fp16, [1, 8, 3, 512]> var_3987_cast_fp16 = mul(x = q_67, y = cos_f)[name = string("op_3987_cast_fp16")];
+            tensor<int32, [2]> var_3988_split_sizes_0 = const()[name = string("op_3988_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3988_axis_0 = const()[name = string("op_3988_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_3988_0, tensor<fp16, [1, 8, 3, 256]> var_3988_1 = split(axis = var_3988_axis_0, split_sizes = var_3988_split_sizes_0, x = q_67)[name = string("op_3988")];
+            fp16 const_63_promoted = const()[name = string("const_63_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_3990 = mul(x = var_3988_1, y = const_63_promoted)[name = string("op_3990")];
+            int32 var_3992 = const()[name = string("op_3992"), val = int32(-1)];
+            bool var_3993_interleave_0 = const()[name = string("op_3993_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_3993 = concat(axis = var_3992, interleave = var_3993_interleave_0, values = (var_3990, var_3988_0))[name = string("op_3993")];
+            tensor<fp16, [1, 8, 3, 512]> var_3994_cast_fp16 = mul(x = var_3993, y = sin_f)[name = string("op_3994_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_71_cast_fp16 = add(x = var_3987_cast_fp16, y = var_3994_cast_fp16)[name = string("q_71_cast_fp16")];
+            string k_31_pad_type_0 = const()[name = string("k_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_31_strides_0 = const()[name = string("k_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_31_pad_0 = const()[name = string("k_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_31_dilations_0 = const()[name = string("k_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_31_groups_0 = const()[name = string("k_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> k_31 = conv(dilations = k_31_dilations_0, groups = k_31_groups_0, pad = k_31_pad_0, pad_type = k_31_pad_type_0, strides = k_31_strides_0, weight = layers_5_self_attn_k_proj_weight_palettized, x = var_3914_cast_fp16)[name = string("k_31")];
+            tensor<int32, [4]> var_4012 = const()[name = string("op_4012"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_4013 = reshape(shape = var_4012, x = k_31)[name = string("op_4013")];
+            tensor<int32, [4]> transpose_64_perm_0 = const()[name = string("transpose_64_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_11_pad_type_0 = const()[name = string("v_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_11_strides_0 = const()[name = string("v_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = string("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_11_dilations_0 = const()[name = string("v_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_11_groups_0 = const()[name = string("v_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> v_11 = conv(dilations = v_11_dilations_0, groups = v_11_groups_0, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = v_11_strides_0, weight = layers_5_self_attn_v_proj_weight_palettized, x = var_3914_cast_fp16)[name = string("v_11")];
+            tensor<int32, [4]> var_4040 = const()[name = string("op_4040"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_4041 = reshape(shape = var_4040, x = v_11)[name = string("op_4041")];
+            tensor<int32, [4]> var_4046 = const()[name = string("op_4046"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4064 = const()[name = string("op_4064"), val = tensor<int32, [3]>([3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> transpose_64 = transpose(perm = transpose_64_perm_0, x = var_4013)[name = string("transpose_136")];
+            tensor<fp16, [3, 2, 512]> x_103 = reshape(shape = var_4064, x = transpose_64)[name = string("x_103")];
+            int32 var_4070 = const()[name = string("op_4070"), val = int32(-1)];
+            fp16 const_64_promoted = const()[name = string("const_64_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 512]> var_4072 = mul(x = x_103, y = const_64_promoted)[name = string("op_4072")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 1024]> input_161 = concat(axis = var_4070, interleave = input_161_interleave_0, values = (x_103, var_4072))[name = string("input_161")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4067_to_fp16 = const()[name = string("op_4067_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 1024]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_4067_to_fp16, x = input_161)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_4077_split_sizes_0 = const()[name = string("op_4077_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_4077_axis_0 = const()[name = string("op_4077_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 512]> var_4077_0, tensor<fp16, [3, 2, 512]> var_4077_1 = split(axis = var_4077_axis_0, split_sizes = var_4077_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_4077")];
+            tensor<fp16, [3, 2, 512]> k_35 = mul(x = var_4077_0, y = layers_5_self_attn_k_norm_weight)[name = string("k_35")];
+            tensor<int32, [4]> var_4084 = const()[name = string("op_4084"), val = tensor<int32, [4]>([1, 3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> var_4085 = reshape(shape = var_4084, x = k_35)[name = string("op_4085")];
+            tensor<int32, [4]> var_4090 = const()[name = string("op_4090"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_4092_promoted = const()[name = string("op_4092_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 512]> var_4047 = transpose(perm = var_4046, x = var_4041)[name = string("transpose_135")];
+            tensor<fp16, [1, 2, 3, 512]> var_4093 = pow(x = var_4047, y = var_4092_promoted)[name = string("op_4093")];
+            tensor<int32, [1]> var_4098_axes_0 = const()[name = string("op_4098_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4098_keep_dims_0 = const()[name = string("op_4098_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_4098 = reduce_mean(axes = var_4098_axes_0, keep_dims = var_4098_keep_dims_0, x = var_4093)[name = string("op_4098")];
+            fp16 var_4100_to_fp16 = const()[name = string("op_4100_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_11_cast_fp16 = add(x = var_4098, y = var_4100_to_fp16)[name = string("mean_sq_11_cast_fp16")];
+            fp32 var_4102_epsilon_0 = const()[name = string("op_4102_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_4102_cast_fp16 = rsqrt(epsilon = var_4102_epsilon_0, x = mean_sq_11_cast_fp16)[name = string("op_4102_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> v_13_cast_fp16 = mul(x = var_4047, y = var_4102_cast_fp16)[name = string("v_13_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> q_69 = transpose(perm = var_4090, x = var_4085)[name = string("transpose_134")];
+            tensor<fp16, [1, 2, 3, 512]> var_4104_cast_fp16 = mul(x = q_69, y = cos_f)[name = string("op_4104_cast_fp16")];
+            tensor<int32, [2]> var_4105_split_sizes_0 = const()[name = string("op_4105_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4105_axis_0 = const()[name = string("op_4105_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 256]> var_4105_0, tensor<fp16, [1, 2, 3, 256]> var_4105_1 = split(axis = var_4105_axis_0, split_sizes = var_4105_split_sizes_0, x = q_69)[name = string("op_4105")];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 256]> var_4107 = mul(x = var_4105_1, y = const_65_promoted)[name = string("op_4107")];
+            int32 var_4109 = const()[name = string("op_4109"), val = int32(-1)];
+            bool var_4110_interleave_0 = const()[name = string("op_4110_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 512]> var_4110 = concat(axis = var_4109, interleave = var_4110_interleave_0, values = (var_4107, var_4105_0))[name = string("op_4110")];
+            tensor<fp16, [1, 2, 3, 512]> var_4111_cast_fp16 = mul(x = var_4110, y = sin_f)[name = string("op_4111_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> k_37_cast_fp16 = add(x = var_4104_cast_fp16, y = var_4111_cast_fp16)[name = string("k_37_cast_fp16")];
+            tensor<int32, [4]> var_4120_reps_0 = const()[name = string("op_4120_reps_0"), val = tensor<int32, [4]>([1, 2, 1, 1])];
+            tensor<fp16, [1, 2, 2048, 3]> var_4120_cast_fp16 = tile(reps = var_4120_reps_0, x = update_indicator)[name = string("op_4120_cast_fp16")];
+            bool k_scattered_1_transpose_x_0 = const()[name = string("k_scattered_1_transpose_x_0"), val = bool(false)];
+            bool k_scattered_1_transpose_y_0 = const()[name = string("k_scattered_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> k_scattered_1_cast_fp16 = matmul(transpose_x = k_scattered_1_transpose_x_0, transpose_y = k_scattered_1_transpose_y_0, x = var_4120_cast_fp16, y = k_37_cast_fp16)[name = string("k_scattered_1_cast_fp16")];
+            bool v_scattered_1_transpose_x_0 = const()[name = string("v_scattered_1_transpose_x_0"), val = bool(false)];
+            bool v_scattered_1_transpose_y_0 = const()[name = string("v_scattered_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> v_scattered_1_cast_fp16 = matmul(transpose_x = v_scattered_1_transpose_x_0, transpose_y = v_scattered_1_transpose_y_0, x = var_4120_cast_fp16, y = v_13_cast_fp16)[name = string("v_scattered_1_cast_fp16")];
+            tensor<int32, [1]> var_4134_axes_0 = const()[name = string("op_4134_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4134_keep_dims_0 = const()[name = string("op_4134_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 2048, 1]> var_4134_cast_fp16 = reduce_sum(axes = var_4134_axes_0, keep_dims = var_4134_keep_dims_0, x = update_indicator)[name = string("op_4134_cast_fp16")];
+            tensor<int32, [4]> slot_k_11_begin_0 = const()[name = string("slot_k_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_11_end_0 = const()[name = string("slot_k_11_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_k_11_end_mask_0 = const()[name = string("slot_k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_k_11_cast_fp16 = slice_by_index(begin = slot_k_11_begin_0, end = slot_k_11_end_0, end_mask = slot_k_11_end_mask_0, x = K_full_in)[name = string("slot_k_11_cast_fp16")];
+            tensor<int32, [4]> slot_v_11_begin_0 = const()[name = string("slot_v_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_11_end_0 = const()[name = string("slot_v_11_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_v_11_end_mask_0 = const()[name = string("slot_v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_v_11_cast_fp16 = slice_by_index(begin = slot_v_11_begin_0, end = slot_v_11_end_0, end_mask = slot_v_11_end_mask_0, x = V_full_in)[name = string("slot_v_11_cast_fp16")];
+            fp16 var_4145_promoted_to_fp16 = const()[name = string("op_4145_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 2048, 1]> var_4147_cast_fp16 = sub(x = var_4145_promoted_to_fp16, y = var_4134_cast_fp16)[name = string("op_4147_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4148_cast_fp16 = mul(x = slot_k_11_cast_fp16, y = var_4147_cast_fp16)[name = string("op_4148_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_k_11_cast_fp16 = add(x = var_4148_cast_fp16, y = k_scattered_1_cast_fp16)[name = string("new_k_11_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4154_cast_fp16 = mul(x = slot_v_11_cast_fp16, y = var_4147_cast_fp16)[name = string("op_4154_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_v_11_cast_fp16 = add(x = var_4154_cast_fp16, y = v_scattered_1_cast_fp16)[name = string("new_v_11_cast_fp16")];
+            tensor<int32, [4]> var_4166_begin_0 = const()[name = string("op_4166_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_4166_end_0 = const()[name = string("op_4166_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_4166_end_mask_0 = const()[name = string("op_4166_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4166_cast_fp16 = slice_by_index(begin = var_4166_begin_0, end = var_4166_end_0, end_mask = var_4166_end_mask_0, x = K_full_in)[name = string("op_4166_cast_fp16")];
+            int32 var_4168 = const()[name = string("op_4168"), val = int32(0)];
+            bool K_full_out_1_interleave_0 = const()[name = string("K_full_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out_1_cast_fp16 = concat(axis = var_4168, interleave = K_full_out_1_interleave_0, values = (new_k_11_cast_fp16, var_4166_cast_fp16))[name = string("K_full_out_1_cast_fp16")];
+            tensor<int32, [4]> var_4179_begin_0 = const()[name = string("op_4179_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_4179_end_0 = const()[name = string("op_4179_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_4179_end_mask_0 = const()[name = string("op_4179_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4179_cast_fp16 = slice_by_index(begin = var_4179_begin_0, end = var_4179_end_0, end_mask = var_4179_end_mask_0, x = V_full_in)[name = string("op_4179_cast_fp16")];
+            int32 var_4181 = const()[name = string("op_4181"), val = int32(0)];
+            bool V_full_out_1_interleave_0 = const()[name = string("V_full_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out_1_cast_fp16 = concat(axis = var_4181, interleave = V_full_out_1_interleave_0, values = (new_v_11_cast_fp16, var_4179_cast_fp16))[name = string("V_full_out_1_cast_fp16")];
+            tensor<int32, [4]> var_4187_begin_0 = const()[name = string("op_4187_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4187_end_0 = const()[name = string("op_4187_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_4187_end_mask_0 = const()[name = string("op_4187_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4187_cast_fp16 = slice_by_index(begin = var_4187_begin_0, end = var_4187_end_0, end_mask = var_4187_end_mask_0, x = K_full_out_1_cast_fp16)[name = string("op_4187_cast_fp16")];
+            tensor<int32, [4]> var_4197_begin_0 = const()[name = string("op_4197_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4197_end_0 = const()[name = string("op_4197_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_4197_end_mask_0 = const()[name = string("op_4197_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4197_cast_fp16 = slice_by_index(begin = var_4197_begin_0, end = var_4197_end_0, end_mask = var_4197_end_mask_0, x = V_full_out_1_cast_fp16)[name = string("op_4197_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = var_4187_cast_fp16)[name = string("transpose_133")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_22, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_132")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_23, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_65_perm_0 = const()[name = string("transpose_65_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = var_4197_cast_fp16)[name = string("transpose_131")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_24 = const()[name = string("concat_24"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_24, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_25 = const()[name = string("concat_25"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_130")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_25, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_65_cast_fp16 = transpose(perm = transpose_65_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_129")];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_71_cast_fp16, y = transpose_65_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_107_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_107_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_107_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 3, 2048]> var_4232 = sub(x = x_107_cast_fp16, y = reduce_max_5)[name = string("op_4232")];
+            tensor<fp16, [1, 8, 3, 2048]> var_4238 = exp(x = var_4232)[name = string("op_4238")];
+            tensor<int32, [1]> var_4248_axes_0 = const()[name = string("op_4248_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4248_keep_dims_0 = const()[name = string("op_4248_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_4248 = reduce_sum(axes = var_4248_axes_0, keep_dims = var_4248_keep_dims_0, x = var_4238)[name = string("op_4248")];
+            tensor<fp16, [1, 8, 3, 2048]> var_4254_cast_fp16 = real_div(x = var_4238, y = var_4248)[name = string("op_4254_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_128")];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_4254_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_4265 = const()[name = string("op_4265"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4272 = const()[name = string("op_4272"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_4266_cast_fp16 = transpose(perm = var_4265, x = attn_output_31_cast_fp16)[name = string("transpose_127")];
+            tensor<fp16, [1, 3, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_4272, x = var_4266_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_4277 = const()[name = string("op_4277"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4293_pad_type_0 = const()[name = string("op_4293_pad_type_0"), val = string("valid")];
+            int32 var_4293_groups_0 = const()[name = string("op_4293_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4293_strides_0 = const()[name = string("op_4293_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4293_pad_0 = const()[name = string("op_4293_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4293_dilations_0 = const()[name = string("op_4293_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(559906112))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565149056))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_4278_cast_fp16 = transpose(perm = var_4277, x = attn_output_33_cast_fp16)[name = string("transpose_126")];
+            tensor<fp16, [1, 2560, 3]> var_4293_cast_fp16 = conv(dilations = var_4293_dilations_0, groups = var_4293_groups_0, pad = var_4293_pad_0, pad_type = var_4293_pad_type_0, strides = var_4293_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_4278_cast_fp16)[name = string("op_4293_cast_fp16")];
+            tensor<int32, [3]> var_4297 = const()[name = string("op_4297"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4303 = const()[name = string("op_4303"), val = int32(-1)];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_111_cast_fp16 = transpose(perm = var_4297, x = var_4293_cast_fp16)[name = string("transpose_125")];
+            tensor<fp16, [1, 3, 2560]> var_4305_cast_fp16 = mul(x = x_111_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_4305_cast_fp16")];
+            bool input_165_interleave_0 = const()[name = string("input_165_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_165_cast_fp16 = concat(axis = var_4303, interleave = input_165_interleave_0, values = (x_111_cast_fp16, var_4305_cast_fp16))[name = string("input_165_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4300_to_fp16 = const()[name = string("op_4300_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_4300_to_fp16, x = input_165_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_4310_split_sizes_0 = const()[name = string("op_4310_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4310_axis_0 = const()[name = string("op_4310_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4310_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4310_cast_fp16_1 = split(axis = var_4310_axis_0, split_sizes = var_4310_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_4310_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565151680)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_35_cast_fp16 = mul(x = var_4310_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_113_cast_fp16 = add(x = x_99_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_113_cast_fp16")];
+            int32 var_4319 = const()[name = string("op_4319"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_4321_cast_fp16 = mul(x = x_113_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_4321_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_167_cast_fp16 = concat(axis = var_4319, interleave = input_167_interleave_0, values = (x_113_cast_fp16, var_4321_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4316_to_fp16 = const()[name = string("op_4316_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_4316_to_fp16, x = input_167_cast_fp16)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_4326_split_sizes_0 = const()[name = string("op_4326_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4326_axis_0 = const()[name = string("op_4326_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4326_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4326_cast_fp16_1 = split(axis = var_4326_axis_0, split_sizes = var_4326_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_4326_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565156864)))];
+            tensor<fp16, [1, 3, 2560]> h_33_cast_fp16 = mul(x = var_4326_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_4337 = const()[name = string("op_4337"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_169_axes_0 = const()[name = string("input_169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4338 = transpose(perm = var_4337, x = h_33_cast_fp16)[name = string("transpose_124")];
+            tensor<fp16, [1, 2560, 1, 3]> input_169 = expand_dims(axes = input_169_axes_0, x = var_4338)[name = string("input_169")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_169)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_169)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 3]> input_171 = mul(x = gate_23, y = up_11)[name = string("input_171")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_171)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_4378_axes_0 = const()[name = string("op_4378_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4378 = squeeze(axes = var_4378_axes_0, x = mlp_out_11)[name = string("op_4378")];
+            tensor<int32, [3]> var_4382 = const()[name = string("op_4382"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4388 = const()[name = string("op_4388"), val = int32(-1)];
+            fp16 const_68_promoted = const()[name = string("const_68_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_115 = transpose(perm = var_4382, x = var_4378)[name = string("transpose_123")];
+            tensor<fp16, [1, 3, 2560]> var_4390 = mul(x = x_115, y = const_68_promoted)[name = string("op_4390")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_173 = concat(axis = var_4388, interleave = input_173_interleave_0, values = (x_115, var_4390))[name = string("input_173")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4385_to_fp16 = const()[name = string("op_4385_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_4385_to_fp16, x = input_173)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_4395_split_sizes_0 = const()[name = string("op_4395_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4395_axis_0 = const()[name = string("op_4395_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4395_0, tensor<fp16, [1, 3, 2560]> var_4395_1 = split(axis = var_4395_axis_0, split_sizes = var_4395_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_4395")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_53 = mul(x = var_4395_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_55_cast_fp16 = add(x = x_113_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 1280])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 3, 1536])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_4423 = const()[name = string("op_4423"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_175_axes_0 = const()[name = string("input_175_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4424 = transpose(perm = var_4423, x = hidden_states_55_cast_fp16)[name = string("transpose_122")];
+            tensor<fp16, [1, 2560, 1, 3]> input_175 = expand_dims(axes = input_175_axes_0, x = var_4424)[name = string("input_175")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_175)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_4443 = const()[name = string("op_4443"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_4444_cast_fp16 = transpose(perm = var_4443, x = per_layer_slice_11_cast_fp16)[name = string("transpose_121")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_4444_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_177_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_177_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565162048))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565489792))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_177_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_4460_axes_0 = const()[name = string("op_4460_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4460_cast_fp16 = squeeze(axes = var_4460_axes_0, x = gated_35_cast_fp16)[name = string("op_4460_cast_fp16")];
+            tensor<int32, [3]> var_4464 = const()[name = string("op_4464"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4470 = const()[name = string("op_4470"), val = int32(-1)];
+            fp16 const_69_promoted_to_fp16 = const()[name = string("const_69_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_117_cast_fp16 = transpose(perm = var_4464, x = var_4460_cast_fp16)[name = string("transpose_120")];
+            tensor<fp16, [1, 3, 2560]> var_4472_cast_fp16 = mul(x = x_117_cast_fp16, y = const_69_promoted_to_fp16)[name = string("op_4472_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_179_cast_fp16 = concat(axis = var_4470, interleave = input_179_interleave_0, values = (x_117_cast_fp16, var_4472_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4467_to_fp16 = const()[name = string("op_4467_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_4467_to_fp16, x = input_179_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_4477_split_sizes_0 = const()[name = string("op_4477_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4477_axis_0 = const()[name = string("op_4477_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4477_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4477_cast_fp16_1 = split(axis = var_4477_axis_0, split_sizes = var_4477_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_4477_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565492416)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_59_cast_fp16 = mul(x = var_4477_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.36p-1])];
+            tensor<fp16, [1, 3, 2560]> x_119_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_70_promoted_to_fp16)[name = string("x_119_cast_fp16")];
+            int32 var_4492 = const()[name = string("op_4492"), val = int32(-1)];
+            fp16 const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_4494_cast_fp16 = mul(x = x_119_cast_fp16, y = const_71_promoted_to_fp16)[name = string("op_4494_cast_fp16")];
+            bool input_181_interleave_0 = const()[name = string("input_181_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_181_cast_fp16 = concat(axis = var_4492, interleave = input_181_interleave_0, values = (x_119_cast_fp16, var_4494_cast_fp16))[name = string("input_181_cast_fp16")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4489_to_fp16 = const()[name = string("op_4489_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_4489_to_fp16, x = input_181_cast_fp16)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_4499_split_sizes_0 = const()[name = string("op_4499_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4499_axis_0 = const()[name = string("op_4499_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4499_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4499_cast_fp16_1 = split(axis = var_4499_axis_0, split_sizes = var_4499_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_4499_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565497600)))];
+            tensor<fp16, [1, 3, 2560]> h_37_cast_fp16 = mul(x = var_4499_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_4505 = const()[name = string("op_4505"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4508_axes_0 = const()[name = string("op_4508_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4506_cast_fp16 = transpose(perm = var_4505, x = h_37_cast_fp16)[name = string("transpose_119")];
+            tensor<fp16, [1, 2560, 1, 3]> var_4508_cast_fp16 = expand_dims(axes = var_4508_axes_0, x = var_4506_cast_fp16)[name = string("op_4508_cast_fp16")];
+            string q_73_pad_type_0 = const()[name = string("q_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_73_strides_0 = const()[name = string("q_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_73_pad_0 = const()[name = string("q_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_73_dilations_0 = const()[name = string("q_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_73_groups_0 = const()[name = string("q_73_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_73 = conv(dilations = q_73_dilations_0, groups = q_73_groups_0, pad = q_73_pad_0, pad_type = q_73_pad_type_0, strides = q_73_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_4508_cast_fp16)[name = string("q_73")];
+            tensor<int32, [4]> var_4529 = const()[name = string("op_4529"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_4530 = reshape(shape = var_4529, x = q_73)[name = string("op_4530")];
+            tensor<int32, [4]> transpose_66_perm_0 = const()[name = string("transpose_66_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_4553 = const()[name = string("op_4553"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_66 = transpose(perm = transpose_66_perm_0, x = var_4530)[name = string("transpose_118")];
+            tensor<fp16, [3, 8, 256]> x_121 = reshape(shape = var_4553, x = transpose_66)[name = string("x_121")];
+            int32 var_4559 = const()[name = string("op_4559"), val = int32(-1)];
+            fp16 const_72_promoted = const()[name = string("const_72_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_4561 = mul(x = x_121, y = const_72_promoted)[name = string("op_4561")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_185 = concat(axis = var_4559, interleave = input_185_interleave_0, values = (x_121, var_4561))[name = string("input_185")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4556_to_fp16 = const()[name = string("op_4556_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_4556_to_fp16, x = input_185)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_4566_split_sizes_0 = const()[name = string("op_4566_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4566_axis_0 = const()[name = string("op_4566_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_4566_0, tensor<fp16, [3, 8, 256]> var_4566_1 = split(axis = var_4566_axis_0, split_sizes = var_4566_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_4566")];
+            tensor<fp16, [3, 8, 256]> q_77 = mul(x = var_4566_0, y = layers_3_self_attn_q_norm_weight)[name = string("q_77")];
+            tensor<int32, [4]> var_4573 = const()[name = string("op_4573"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_4574 = reshape(shape = var_4573, x = q_77)[name = string("op_4574")];
+            tensor<int32, [4]> var_4579 = const()[name = string("op_4579"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_79 = transpose(perm = var_4579, x = var_4574)[name = string("transpose_117")];
+            tensor<fp16, [1, 8, 3, 256]> var_4581_cast_fp16 = mul(x = q_79, y = cos_s)[name = string("op_4581_cast_fp16")];
+            tensor<int32, [2]> var_4582_split_sizes_0 = const()[name = string("op_4582_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4582_axis_0 = const()[name = string("op_4582_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_4582_0, tensor<fp16, [1, 8, 3, 128]> var_4582_1 = split(axis = var_4582_axis_0, split_sizes = var_4582_split_sizes_0, x = q_79)[name = string("op_4582")];
+            fp16 const_73_promoted = const()[name = string("const_73_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_4584 = mul(x = var_4582_1, y = const_73_promoted)[name = string("op_4584")];
+            int32 var_4586 = const()[name = string("op_4586"), val = int32(-1)];
+            bool var_4587_interleave_0 = const()[name = string("op_4587_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_4587 = concat(axis = var_4586, interleave = var_4587_interleave_0, values = (var_4584, var_4582_0))[name = string("op_4587")];
+            tensor<fp16, [1, 8, 3, 256]> var_4588_cast_fp16 = mul(x = var_4587, y = sin_s)[name = string("op_4588_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_83_cast_fp16 = add(x = var_4581_cast_fp16, y = var_4588_cast_fp16)[name = string("q_83_cast_fp16")];
+            string k_39_pad_type_0 = const()[name = string("k_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_39_strides_0 = const()[name = string("k_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_39_pad_0 = const()[name = string("k_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_39_dilations_0 = const()[name = string("k_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_39_groups_0 = const()[name = string("k_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_39 = conv(dilations = k_39_dilations_0, groups = k_39_groups_0, pad = k_39_pad_0, pad_type = k_39_pad_type_0, strides = k_39_strides_0, weight = layers_6_self_attn_k_proj_weight_palettized, x = var_4508_cast_fp16)[name = string("k_39")];
+            tensor<int32, [4]> var_4606 = const()[name = string("op_4606"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_4607 = reshape(shape = var_4606, x = k_39)[name = string("op_4607")];
+            tensor<int32, [4]> transpose_67_perm_0 = const()[name = string("transpose_67_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_15_pad_type_0 = const()[name = string("v_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_15_strides_0 = const()[name = string("v_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_15_pad_0 = const()[name = string("v_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_15_dilations_0 = const()[name = string("v_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_15_groups_0 = const()[name = string("v_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_15 = conv(dilations = v_15_dilations_0, groups = v_15_groups_0, pad = v_15_pad_0, pad_type = v_15_pad_type_0, strides = v_15_strides_0, weight = layers_6_self_attn_v_proj_weight_palettized, x = var_4508_cast_fp16)[name = string("v_15")];
+            tensor<int32, [4]> var_4634 = const()[name = string("op_4634"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_4635 = reshape(shape = var_4634, x = v_15)[name = string("op_4635")];
+            tensor<int32, [4]> var_4640 = const()[name = string("op_4640"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4658 = const()[name = string("op_4658"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_67 = transpose(perm = transpose_67_perm_0, x = var_4607)[name = string("transpose_116")];
+            tensor<fp16, [3, 2, 256]> x_123 = reshape(shape = var_4658, x = transpose_67)[name = string("x_123")];
+            int32 var_4664 = const()[name = string("op_4664"), val = int32(-1)];
+            fp16 const_74_promoted = const()[name = string("const_74_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_4666 = mul(x = x_123, y = const_74_promoted)[name = string("op_4666")];
+            bool input_187_interleave_0 = const()[name = string("input_187_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_187 = concat(axis = var_4664, interleave = input_187_interleave_0, values = (x_123, var_4666))[name = string("input_187")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4661_to_fp16 = const()[name = string("op_4661_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_4661_to_fp16, x = input_187)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_4671_split_sizes_0 = const()[name = string("op_4671_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4671_axis_0 = const()[name = string("op_4671_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_4671_0, tensor<fp16, [3, 2, 256]> var_4671_1 = split(axis = var_4671_axis_0, split_sizes = var_4671_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_4671")];
+            tensor<fp16, [3, 2, 256]> k_43 = mul(x = var_4671_0, y = layers_6_self_attn_k_norm_weight)[name = string("k_43")];
+            tensor<int32, [4]> var_4678 = const()[name = string("op_4678"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_4679 = reshape(shape = var_4678, x = k_43)[name = string("op_4679")];
+            tensor<int32, [4]> var_4684 = const()[name = string("op_4684"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_4686_promoted = const()[name = string("op_4686_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_4641 = transpose(perm = var_4640, x = var_4635)[name = string("transpose_115")];
+            tensor<fp16, [1, 2, 3, 256]> var_4687 = pow(x = var_4641, y = var_4686_promoted)[name = string("op_4687")];
+            tensor<int32, [1]> var_4692_axes_0 = const()[name = string("op_4692_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4692_keep_dims_0 = const()[name = string("op_4692_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_4692 = reduce_mean(axes = var_4692_axes_0, keep_dims = var_4692_keep_dims_0, x = var_4687)[name = string("op_4692")];
+            fp16 var_4694_to_fp16 = const()[name = string("op_4694_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_13_cast_fp16 = add(x = var_4692, y = var_4694_to_fp16)[name = string("mean_sq_13_cast_fp16")];
+            fp32 var_4696_epsilon_0 = const()[name = string("op_4696_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_4696_cast_fp16 = rsqrt(epsilon = var_4696_epsilon_0, x = mean_sq_13_cast_fp16)[name = string("op_4696_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_191_cast_fp16 = mul(x = var_4641, y = var_4696_cast_fp16)[name = string("input_191_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_81 = transpose(perm = var_4684, x = var_4679)[name = string("transpose_114")];
+            tensor<fp16, [1, 2, 3, 256]> var_4698_cast_fp16 = mul(x = q_81, y = cos_s)[name = string("op_4698_cast_fp16")];
+            tensor<int32, [2]> var_4699_split_sizes_0 = const()[name = string("op_4699_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4699_axis_0 = const()[name = string("op_4699_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_4699_0, tensor<fp16, [1, 2, 3, 128]> var_4699_1 = split(axis = var_4699_axis_0, split_sizes = var_4699_split_sizes_0, x = q_81)[name = string("op_4699")];
+            fp16 const_75_promoted = const()[name = string("const_75_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_4701 = mul(x = var_4699_1, y = const_75_promoted)[name = string("op_4701")];
+            int32 var_4703 = const()[name = string("op_4703"), val = int32(-1)];
+            bool var_4704_interleave_0 = const()[name = string("op_4704_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_4704 = concat(axis = var_4703, interleave = var_4704_interleave_0, values = (var_4701, var_4699_0))[name = string("op_4704")];
+            tensor<fp16, [1, 2, 3, 256]> var_4705_cast_fp16 = mul(x = var_4704, y = sin_s)[name = string("op_4705_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_189_cast_fp16 = add(x = var_4698_cast_fp16, y = var_4705_cast_fp16)[name = string("input_189_cast_fp16")];
+            tensor<int32, [8]> k_padded_11_pad_0 = const()[name = string("k_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_11_mode_0 = const()[name = string("k_padded_11_mode_0"), val = string("constant")];
+            fp16 const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_11_cast_fp16 = pad(constant_val = const_76_to_fp16, mode = k_padded_11_mode_0, pad = k_padded_11_pad_0, x = input_189_cast_fp16)[name = string("k_padded_11_cast_fp16")];
+            tensor<int32, [8]> v_padded_11_pad_0 = const()[name = string("v_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_11_mode_0 = const()[name = string("v_padded_11_mode_0"), val = string("constant")];
+            fp16 const_77_to_fp16 = const()[name = string("const_77_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_11_cast_fp16 = pad(constant_val = const_77_to_fp16, mode = v_padded_11_mode_0, pad = v_padded_11_pad_0, x = input_191_cast_fp16)[name = string("v_padded_11_cast_fp16")];
+            tensor<int32, [4]> slot_k_13_begin_0 = const()[name = string("slot_k_13_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_13_end_0 = const()[name = string("slot_k_13_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_13_end_mask_0 = const()[name = string("slot_k_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_13_cast_fp16 = slice_by_index(begin = slot_k_13_begin_0, end = slot_k_13_end_0, end_mask = slot_k_13_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("slot_k_13_cast_fp16")];
+            tensor<int32, [4]> slot_v_13_begin_0 = const()[name = string("slot_v_13_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_13_end_0 = const()[name = string("slot_v_13_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_13_end_mask_0 = const()[name = string("slot_v_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_13_cast_fp16 = slice_by_index(begin = slot_v_13_begin_0, end = slot_v_13_end_0, end_mask = slot_v_13_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("slot_v_13_cast_fp16")];
+            tensor<int32, [4]> var_4744_begin_0 = const()[name = string("op_4744_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_4744_end_0 = const()[name = string("op_4744_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4744_end_mask_0 = const()[name = string("op_4744_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_4744_cast_fp16 = slice_by_index(begin = var_4744_begin_0, end = var_4744_end_0, end_mask = var_4744_end_mask_0, x = slot_k_13_cast_fp16)[name = string("op_4744_cast_fp16")];
+            int32 var_4751 = const()[name = string("op_4751"), val = int32(2)];
+            bool new_k_13_interleave_0 = const()[name = string("new_k_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_13_cast_fp16 = concat(axis = var_4751, interleave = new_k_13_interleave_0, values = (var_4744_cast_fp16, k_padded_11_cast_fp16))[name = string("new_k_13_cast_fp16")];
+            tensor<int32, [4]> var_4767_begin_0 = const()[name = string("op_4767_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_4767_end_0 = const()[name = string("op_4767_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4767_end_mask_0 = const()[name = string("op_4767_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_4767_cast_fp16 = slice_by_index(begin = var_4767_begin_0, end = var_4767_end_0, end_mask = var_4767_end_mask_0, x = slot_v_13_cast_fp16)[name = string("op_4767_cast_fp16")];
+            int32 var_4774 = const()[name = string("op_4774"), val = int32(2)];
+            bool new_v_13_interleave_0 = const()[name = string("new_v_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_13_cast_fp16 = concat(axis = var_4774, interleave = new_v_13_interleave_0, values = (var_4767_cast_fp16, v_padded_11_cast_fp16))[name = string("new_v_13_cast_fp16")];
+            tensor<int32, [4]> var_4780_begin_0 = const()[name = string("op_4780_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4780_end_0 = const()[name = string("op_4780_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_4780_end_mask_0 = const()[name = string("op_4780_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_4780_cast_fp16 = slice_by_index(begin = var_4780_begin_0, end = var_4780_end_0, end_mask = var_4780_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("op_4780_cast_fp16")];
+            tensor<int32, [4]> var_4785_begin_0 = const()[name = string("op_4785_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4785_end_0 = const()[name = string("op_4785_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_4785_end_mask_0 = const()[name = string("op_4785_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_4785_cast_fp16 = slice_by_index(begin = var_4785_begin_0, end = var_4785_end_0, end_mask = var_4785_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("op_4785_cast_fp16")];
+            int32 var_4787 = const()[name = string("op_4787"), val = int32(0)];
+            bool K_sliding_out_11_interleave_0 = const()[name = string("K_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_11_cast_fp16 = concat(axis = var_4787, interleave = K_sliding_out_11_interleave_0, values = (var_4780_cast_fp16, new_k_13_cast_fp16, var_4785_cast_fp16))[name = string("K_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4793_begin_0 = const()[name = string("op_4793_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4793_end_0 = const()[name = string("op_4793_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_4793_end_mask_0 = const()[name = string("op_4793_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_4793_cast_fp16 = slice_by_index(begin = var_4793_begin_0, end = var_4793_end_0, end_mask = var_4793_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("op_4793_cast_fp16")];
+            tensor<int32, [4]> var_4798_begin_0 = const()[name = string("op_4798_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4798_end_0 = const()[name = string("op_4798_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_4798_end_mask_0 = const()[name = string("op_4798_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_4798_cast_fp16 = slice_by_index(begin = var_4798_begin_0, end = var_4798_end_0, end_mask = var_4798_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("op_4798_cast_fp16")];
+            int32 var_4800 = const()[name = string("op_4800"), val = int32(0)];
+            bool V_sliding_out_11_interleave_0 = const()[name = string("V_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_11_cast_fp16 = concat(axis = var_4800, interleave = V_sliding_out_11_interleave_0, values = (var_4793_cast_fp16, new_v_13_cast_fp16, var_4798_cast_fp16))[name = string("V_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4806_begin_0 = const()[name = string("op_4806_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4806_end_0 = const()[name = string("op_4806_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4806_end_mask_0 = const()[name = string("op_4806_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_4806_cast_fp16 = slice_by_index(begin = var_4806_begin_0, end = var_4806_end_0, end_mask = var_4806_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("op_4806_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_13_begin_0 = const()[name = string("K_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_13_end_0 = const()[name = string("K_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_13_end_mask_0 = const()[name = string("K_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_13_cast_fp16 = slice_by_index(begin = K_for_attn_13_begin_0, end = K_for_attn_13_end_0, end_mask = K_for_attn_13_end_mask_0, x = var_4806_cast_fp16)[name = string("K_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> var_4816_begin_0 = const()[name = string("op_4816_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4816_end_0 = const()[name = string("op_4816_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4816_end_mask_0 = const()[name = string("op_4816_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_4816_cast_fp16 = slice_by_index(begin = var_4816_begin_0, end = var_4816_end_0, end_mask = var_4816_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("op_4816_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_13_begin_0 = const()[name = string("V_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_13_end_0 = const()[name = string("V_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_13_end_mask_0 = const()[name = string("V_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_13_cast_fp16 = slice_by_index(begin = V_for_attn_13_begin_0, end = V_for_attn_13_end_0, end_mask = V_for_attn_13_end_mask_0, x = var_4816_cast_fp16)[name = string("V_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> transpose_24_perm_0 = const()[name = string("transpose_24_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_12_reps_0 = const()[name = string("tile_12_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_24_cast_fp16 = transpose(perm = transpose_24_perm_0, x = K_for_attn_13_cast_fp16)[name = string("transpose_113")];
+            tensor<fp16, [8, 1, 512, 256]> tile_12_cast_fp16 = tile(reps = tile_12_reps_0, x = transpose_24_cast_fp16)[name = string("tile_12_cast_fp16")];
+            tensor<int32, [5]> concat_26 = const()[name = string("concat_26"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_24_cast_fp16 = reshape(shape = concat_26, x = tile_12_cast_fp16)[name = string("reshape_24_cast_fp16")];
+            tensor<int32, [5]> transpose_25_perm_0 = const()[name = string("transpose_25_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_27 = const()[name = string("concat_27"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_25_cast_fp16 = transpose(perm = transpose_25_perm_0, x = reshape_24_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_25_cast_fp16 = reshape(shape = concat_27, x = transpose_25_cast_fp16)[name = string("reshape_25_cast_fp16")];
+            tensor<int32, [4]> transpose_68_perm_0 = const()[name = string("transpose_68_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_26_perm_0 = const()[name = string("transpose_26_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_13_reps_0 = const()[name = string("tile_13_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_26_cast_fp16 = transpose(perm = transpose_26_perm_0, x = V_for_attn_13_cast_fp16)[name = string("transpose_111")];
+            tensor<fp16, [8, 1, 512, 256]> tile_13_cast_fp16 = tile(reps = tile_13_reps_0, x = transpose_26_cast_fp16)[name = string("tile_13_cast_fp16")];
+            tensor<int32, [5]> concat_28 = const()[name = string("concat_28"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_26_cast_fp16 = reshape(shape = concat_28, x = tile_13_cast_fp16)[name = string("reshape_26_cast_fp16")];
+            tensor<int32, [5]> transpose_27_perm_0 = const()[name = string("transpose_27_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_29 = const()[name = string("concat_29"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_27_cast_fp16 = transpose(perm = transpose_27_perm_0, x = reshape_26_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_27_cast_fp16 = reshape(shape = concat_29, x = transpose_27_cast_fp16)[name = string("reshape_27_cast_fp16")];
+            tensor<int32, [4]> V_expanded_13_perm_0 = const()[name = string("V_expanded_13_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_68_cast_fp16 = transpose(perm = transpose_68_perm_0, x = reshape_25_cast_fp16)[name = string("transpose_109")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_83_cast_fp16, y = transpose_68_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_127_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_127_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_127_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 3, 512]> var_4851 = sub(x = x_127_cast_fp16, y = reduce_max_6)[name = string("op_4851")];
+            tensor<fp16, [1, 8, 3, 512]> var_4857 = exp(x = var_4851)[name = string("op_4857")];
+            tensor<int32, [1]> var_4867_axes_0 = const()[name = string("op_4867_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4867_keep_dims_0 = const()[name = string("op_4867_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_4867 = reduce_sum(axes = var_4867_axes_0, keep_dims = var_4867_keep_dims_0, x = var_4857)[name = string("op_4867")];
+            tensor<fp16, [1, 8, 3, 512]> var_4873_cast_fp16 = real_div(x = var_4857, y = var_4867)[name = string("op_4873_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_13_cast_fp16 = transpose(perm = V_expanded_13_perm_0, x = reshape_27_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_4873_cast_fp16, y = V_expanded_13_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_4884 = const()[name = string("op_4884"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4891 = const()[name = string("op_4891"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_4885_cast_fp16 = transpose(perm = var_4884, x = attn_output_37_cast_fp16)[name = string("transpose_107")];
+            tensor<fp16, [1, 3, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_4891, x = var_4885_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_4896 = const()[name = string("op_4896"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4912_pad_type_0 = const()[name = string("op_4912_pad_type_0"), val = string("valid")];
+            int32 var_4912_groups_0 = const()[name = string("op_4912_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4912_strides_0 = const()[name = string("op_4912_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4912_pad_0 = const()[name = string("op_4912_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4912_dilations_0 = const()[name = string("op_4912_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(565502784))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568124288))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_4897_cast_fp16 = transpose(perm = var_4896, x = attn_output_39_cast_fp16)[name = string("transpose_106")];
+            tensor<fp16, [1, 2560, 3]> var_4912_cast_fp16 = conv(dilations = var_4912_dilations_0, groups = var_4912_groups_0, pad = var_4912_pad_0, pad_type = var_4912_pad_type_0, strides = var_4912_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_4897_cast_fp16)[name = string("op_4912_cast_fp16")];
+            tensor<int32, [3]> var_4916 = const()[name = string("op_4916"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4922 = const()[name = string("op_4922"), val = int32(-1)];
+            fp16 const_78_promoted_to_fp16 = const()[name = string("const_78_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_131_cast_fp16 = transpose(perm = var_4916, x = var_4912_cast_fp16)[name = string("transpose_105")];
+            tensor<fp16, [1, 3, 2560]> var_4924_cast_fp16 = mul(x = x_131_cast_fp16, y = const_78_promoted_to_fp16)[name = string("op_4924_cast_fp16")];
+            bool input_195_interleave_0 = const()[name = string("input_195_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_195_cast_fp16 = concat(axis = var_4922, interleave = input_195_interleave_0, values = (x_131_cast_fp16, var_4924_cast_fp16))[name = string("input_195_cast_fp16")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4919_to_fp16 = const()[name = string("op_4919_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_4919_to_fp16, x = input_195_cast_fp16)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_4929_split_sizes_0 = const()[name = string("op_4929_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4929_axis_0 = const()[name = string("op_4929_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4929_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4929_cast_fp16_1 = split(axis = var_4929_axis_0, split_sizes = var_4929_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_4929_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568126912)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_41_cast_fp16 = mul(x = var_4929_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_133_cast_fp16 = add(x = x_119_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_133_cast_fp16")];
+            int32 var_4938 = const()[name = string("op_4938"), val = int32(-1)];
+            fp16 const_79_promoted_to_fp16 = const()[name = string("const_79_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_4940_cast_fp16 = mul(x = x_133_cast_fp16, y = const_79_promoted_to_fp16)[name = string("op_4940_cast_fp16")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_197_cast_fp16 = concat(axis = var_4938, interleave = input_197_interleave_0, values = (x_133_cast_fp16, var_4940_cast_fp16))[name = string("input_197_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4935_to_fp16 = const()[name = string("op_4935_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_4935_to_fp16, x = input_197_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_4945_split_sizes_0 = const()[name = string("op_4945_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4945_axis_0 = const()[name = string("op_4945_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4945_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4945_cast_fp16_1 = split(axis = var_4945_axis_0, split_sizes = var_4945_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_4945_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568132096)))];
+            tensor<fp16, [1, 3, 2560]> h_39_cast_fp16 = mul(x = var_4945_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_4956 = const()[name = string("op_4956"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_199_axes_0 = const()[name = string("input_199_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4957 = transpose(perm = var_4956, x = h_39_cast_fp16)[name = string("transpose_104")];
+            tensor<fp16, [1, 2560, 1, 3]> input_199 = expand_dims(axes = input_199_axes_0, x = var_4957)[name = string("input_199")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_199)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_199)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 3]> input_201 = mul(x = gate_27, y = up_13)[name = string("input_201")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_201)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_4997_axes_0 = const()[name = string("op_4997_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4997 = squeeze(axes = var_4997_axes_0, x = mlp_out_13)[name = string("op_4997")];
+            tensor<int32, [3]> var_5001 = const()[name = string("op_5001"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5007 = const()[name = string("op_5007"), val = int32(-1)];
+            fp16 const_80_promoted = const()[name = string("const_80_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_135 = transpose(perm = var_5001, x = var_4997)[name = string("transpose_103")];
+            tensor<fp16, [1, 3, 2560]> var_5009 = mul(x = x_135, y = const_80_promoted)[name = string("op_5009")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_203 = concat(axis = var_5007, interleave = input_203_interleave_0, values = (x_135, var_5009))[name = string("input_203")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5004_to_fp16 = const()[name = string("op_5004_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_5004_to_fp16, x = input_203)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_5014_split_sizes_0 = const()[name = string("op_5014_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5014_axis_0 = const()[name = string("op_5014_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5014_0, tensor<fp16, [1, 3, 2560]> var_5014_1 = split(axis = var_5014_axis_0, split_sizes = var_5014_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_5014")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_63 = mul(x = var_5014_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_65_cast_fp16 = add(x = x_133_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 1536])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 3, 1792])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_5042 = const()[name = string("op_5042"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5043 = transpose(perm = var_5042, x = hidden_states_65_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [1, 2560, 1, 3]> input_205 = expand_dims(axes = input_205_axes_0, x = var_5043)[name = string("input_205")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_205)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_5062 = const()[name = string("op_5062"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_5063_cast_fp16 = transpose(perm = var_5062, x = per_layer_slice_13_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_5063_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_207_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_207_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568137280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568465024))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_207_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_5079_axes_0 = const()[name = string("op_5079_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5079_cast_fp16 = squeeze(axes = var_5079_axes_0, x = gated_41_cast_fp16)[name = string("op_5079_cast_fp16")];
+            tensor<int32, [3]> var_5083 = const()[name = string("op_5083"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5089 = const()[name = string("op_5089"), val = int32(-1)];
+            fp16 const_81_promoted_to_fp16 = const()[name = string("const_81_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_137_cast_fp16 = transpose(perm = var_5083, x = var_5079_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [1, 3, 2560]> var_5091_cast_fp16 = mul(x = x_137_cast_fp16, y = const_81_promoted_to_fp16)[name = string("op_5091_cast_fp16")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_209_cast_fp16 = concat(axis = var_5089, interleave = input_209_interleave_0, values = (x_137_cast_fp16, var_5091_cast_fp16))[name = string("input_209_cast_fp16")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5086_to_fp16 = const()[name = string("op_5086_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_5086_to_fp16, x = input_209_cast_fp16)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_5096_split_sizes_0 = const()[name = string("op_5096_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5096_axis_0 = const()[name = string("op_5096_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5096_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5096_cast_fp16_1 = split(axis = var_5096_axis_0, split_sizes = var_5096_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_5096_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568467648)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_69_cast_fp16 = mul(x = var_5096_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_82_promoted_to_fp16 = const()[name = string("const_82_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.1ep-1])];
+            tensor<fp16, [1, 3, 2560]> x_139_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_82_promoted_to_fp16)[name = string("x_139_cast_fp16")];
+            int32 var_5111 = const()[name = string("op_5111"), val = int32(-1)];
+            fp16 const_83_promoted_to_fp16 = const()[name = string("const_83_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_5113_cast_fp16 = mul(x = x_139_cast_fp16, y = const_83_promoted_to_fp16)[name = string("op_5113_cast_fp16")];
+            bool input_211_interleave_0 = const()[name = string("input_211_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_211_cast_fp16 = concat(axis = var_5111, interleave = input_211_interleave_0, values = (x_139_cast_fp16, var_5113_cast_fp16))[name = string("input_211_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5108_to_fp16 = const()[name = string("op_5108_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_5108_to_fp16, x = input_211_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_5118_split_sizes_0 = const()[name = string("op_5118_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5118_axis_0 = const()[name = string("op_5118_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5118_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5118_cast_fp16_1 = split(axis = var_5118_axis_0, split_sizes = var_5118_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_5118_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568472832)))];
+            tensor<fp16, [1, 3, 2560]> h_43_cast_fp16 = mul(x = var_5118_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_5124 = const()[name = string("op_5124"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5127_axes_0 = const()[name = string("op_5127_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5125_cast_fp16 = transpose(perm = var_5124, x = h_43_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 2560, 1, 3]> var_5127_cast_fp16 = expand_dims(axes = var_5127_axes_0, x = var_5125_cast_fp16)[name = string("op_5127_cast_fp16")];
+            string q_85_pad_type_0 = const()[name = string("q_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_85_strides_0 = const()[name = string("q_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_85_pad_0 = const()[name = string("q_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_85_dilations_0 = const()[name = string("q_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_85_groups_0 = const()[name = string("q_85_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_85 = conv(dilations = q_85_dilations_0, groups = q_85_groups_0, pad = q_85_pad_0, pad_type = q_85_pad_type_0, strides = q_85_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_5127_cast_fp16)[name = string("q_85")];
+            tensor<int32, [4]> var_5148 = const()[name = string("op_5148"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_5149 = reshape(shape = var_5148, x = q_85)[name = string("op_5149")];
+            tensor<int32, [4]> transpose_69_perm_0 = const()[name = string("transpose_69_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_5172 = const()[name = string("op_5172"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_69 = transpose(perm = transpose_69_perm_0, x = var_5149)[name = string("transpose_98")];
+            tensor<fp16, [3, 8, 256]> x_141 = reshape(shape = var_5172, x = transpose_69)[name = string("x_141")];
+            int32 var_5178 = const()[name = string("op_5178"), val = int32(-1)];
+            fp16 const_84_promoted = const()[name = string("const_84_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_5180 = mul(x = x_141, y = const_84_promoted)[name = string("op_5180")];
+            bool input_215_interleave_0 = const()[name = string("input_215_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_215 = concat(axis = var_5178, interleave = input_215_interleave_0, values = (x_141, var_5180))[name = string("input_215")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5175_to_fp16 = const()[name = string("op_5175_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_5175_to_fp16, x = input_215)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_5185_split_sizes_0 = const()[name = string("op_5185_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5185_axis_0 = const()[name = string("op_5185_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_5185_0, tensor<fp16, [3, 8, 256]> var_5185_1 = split(axis = var_5185_axis_0, split_sizes = var_5185_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_5185")];
+            tensor<fp16, [3, 8, 256]> q_89 = mul(x = var_5185_0, y = layers_7_self_attn_q_norm_weight)[name = string("q_89")];
+            tensor<int32, [4]> var_5192 = const()[name = string("op_5192"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_5193 = reshape(shape = var_5192, x = q_89)[name = string("op_5193")];
+            tensor<int32, [4]> var_5198 = const()[name = string("op_5198"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_91 = transpose(perm = var_5198, x = var_5193)[name = string("transpose_97")];
+            tensor<fp16, [1, 8, 3, 256]> var_5200_cast_fp16 = mul(x = q_91, y = cos_s)[name = string("op_5200_cast_fp16")];
+            tensor<int32, [2]> var_5201_split_sizes_0 = const()[name = string("op_5201_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5201_axis_0 = const()[name = string("op_5201_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_5201_0, tensor<fp16, [1, 8, 3, 128]> var_5201_1 = split(axis = var_5201_axis_0, split_sizes = var_5201_split_sizes_0, x = q_91)[name = string("op_5201")];
+            fp16 const_85_promoted = const()[name = string("const_85_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_5203 = mul(x = var_5201_1, y = const_85_promoted)[name = string("op_5203")];
+            int32 var_5205 = const()[name = string("op_5205"), val = int32(-1)];
+            bool var_5206_interleave_0 = const()[name = string("op_5206_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_5206 = concat(axis = var_5205, interleave = var_5206_interleave_0, values = (var_5203, var_5201_0))[name = string("op_5206")];
+            tensor<fp16, [1, 8, 3, 256]> var_5207_cast_fp16 = mul(x = var_5206, y = sin_s)[name = string("op_5207_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_95_cast_fp16 = add(x = var_5200_cast_fp16, y = var_5207_cast_fp16)[name = string("q_95_cast_fp16")];
+            string k_45_pad_type_0 = const()[name = string("k_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_45_strides_0 = const()[name = string("k_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_45_pad_0 = const()[name = string("k_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_45_dilations_0 = const()[name = string("k_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_45_groups_0 = const()[name = string("k_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_45 = conv(dilations = k_45_dilations_0, groups = k_45_groups_0, pad = k_45_pad_0, pad_type = k_45_pad_type_0, strides = k_45_strides_0, weight = layers_7_self_attn_k_proj_weight_palettized, x = var_5127_cast_fp16)[name = string("k_45")];
+            tensor<int32, [4]> var_5225 = const()[name = string("op_5225"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5226 = reshape(shape = var_5225, x = k_45)[name = string("op_5226")];
+            tensor<int32, [4]> transpose_70_perm_0 = const()[name = string("transpose_70_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_17_pad_type_0 = const()[name = string("v_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_17_strides_0 = const()[name = string("v_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_17_pad_0 = const()[name = string("v_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_17_dilations_0 = const()[name = string("v_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_17_groups_0 = const()[name = string("v_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_17 = conv(dilations = v_17_dilations_0, groups = v_17_groups_0, pad = v_17_pad_0, pad_type = v_17_pad_type_0, strides = v_17_strides_0, weight = layers_7_self_attn_v_proj_weight_palettized, x = var_5127_cast_fp16)[name = string("v_17")];
+            tensor<int32, [4]> var_5253 = const()[name = string("op_5253"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5254 = reshape(shape = var_5253, x = v_17)[name = string("op_5254")];
+            tensor<int32, [4]> var_5259 = const()[name = string("op_5259"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5277 = const()[name = string("op_5277"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_70 = transpose(perm = transpose_70_perm_0, x = var_5226)[name = string("transpose_96")];
+            tensor<fp16, [3, 2, 256]> x_143 = reshape(shape = var_5277, x = transpose_70)[name = string("x_143")];
+            int32 var_5283 = const()[name = string("op_5283"), val = int32(-1)];
+            fp16 const_86_promoted = const()[name = string("const_86_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_5285 = mul(x = x_143, y = const_86_promoted)[name = string("op_5285")];
+            bool input_217_interleave_0 = const()[name = string("input_217_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_217 = concat(axis = var_5283, interleave = input_217_interleave_0, values = (x_143, var_5285))[name = string("input_217")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5280_to_fp16 = const()[name = string("op_5280_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_5280_to_fp16, x = input_217)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_5290_split_sizes_0 = const()[name = string("op_5290_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5290_axis_0 = const()[name = string("op_5290_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_5290_0, tensor<fp16, [3, 2, 256]> var_5290_1 = split(axis = var_5290_axis_0, split_sizes = var_5290_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_5290")];
+            tensor<fp16, [3, 2, 256]> k_49 = mul(x = var_5290_0, y = layers_0_self_attn_k_norm_weight)[name = string("k_49")];
+            tensor<int32, [4]> var_5297 = const()[name = string("op_5297"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_5298 = reshape(shape = var_5297, x = k_49)[name = string("op_5298")];
+            tensor<int32, [4]> var_5303 = const()[name = string("op_5303"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_5305_promoted = const()[name = string("op_5305_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_5260 = transpose(perm = var_5259, x = var_5254)[name = string("transpose_95")];
+            tensor<fp16, [1, 2, 3, 256]> var_5306 = pow(x = var_5260, y = var_5305_promoted)[name = string("op_5306")];
+            tensor<int32, [1]> var_5311_axes_0 = const()[name = string("op_5311_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5311_keep_dims_0 = const()[name = string("op_5311_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_5311 = reduce_mean(axes = var_5311_axes_0, keep_dims = var_5311_keep_dims_0, x = var_5306)[name = string("op_5311")];
+            fp16 var_5313_to_fp16 = const()[name = string("op_5313_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_15_cast_fp16 = add(x = var_5311, y = var_5313_to_fp16)[name = string("mean_sq_15_cast_fp16")];
+            fp32 var_5315_epsilon_0 = const()[name = string("op_5315_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_5315_cast_fp16 = rsqrt(epsilon = var_5315_epsilon_0, x = mean_sq_15_cast_fp16)[name = string("op_5315_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_221_cast_fp16 = mul(x = var_5260, y = var_5315_cast_fp16)[name = string("input_221_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_93 = transpose(perm = var_5303, x = var_5298)[name = string("transpose_94")];
+            tensor<fp16, [1, 2, 3, 256]> var_5317_cast_fp16 = mul(x = q_93, y = cos_s)[name = string("op_5317_cast_fp16")];
+            tensor<int32, [2]> var_5318_split_sizes_0 = const()[name = string("op_5318_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5318_axis_0 = const()[name = string("op_5318_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_5318_0, tensor<fp16, [1, 2, 3, 128]> var_5318_1 = split(axis = var_5318_axis_0, split_sizes = var_5318_split_sizes_0, x = q_93)[name = string("op_5318")];
+            fp16 const_87_promoted = const()[name = string("const_87_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_5320 = mul(x = var_5318_1, y = const_87_promoted)[name = string("op_5320")];
+            int32 var_5322 = const()[name = string("op_5322"), val = int32(-1)];
+            bool var_5323_interleave_0 = const()[name = string("op_5323_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_5323 = concat(axis = var_5322, interleave = var_5323_interleave_0, values = (var_5320, var_5318_0))[name = string("op_5323")];
+            tensor<fp16, [1, 2, 3, 256]> var_5324_cast_fp16 = mul(x = var_5323, y = sin_s)[name = string("op_5324_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_219_cast_fp16 = add(x = var_5317_cast_fp16, y = var_5324_cast_fp16)[name = string("input_219_cast_fp16")];
+            tensor<int32, [8]> k_padded_13_pad_0 = const()[name = string("k_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_13_mode_0 = const()[name = string("k_padded_13_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_13_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = k_padded_13_mode_0, pad = k_padded_13_pad_0, x = input_219_cast_fp16)[name = string("k_padded_13_cast_fp16")];
+            tensor<int32, [8]> v_padded_13_pad_0 = const()[name = string("v_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_13_mode_0 = const()[name = string("v_padded_13_mode_0"), val = string("constant")];
+            fp16 const_89_to_fp16 = const()[name = string("const_89_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_13_cast_fp16 = pad(constant_val = const_89_to_fp16, mode = v_padded_13_mode_0, pad = v_padded_13_pad_0, x = input_221_cast_fp16)[name = string("v_padded_13_cast_fp16")];
+            tensor<int32, [4]> slot_k_15_begin_0 = const()[name = string("slot_k_15_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_15_end_0 = const()[name = string("slot_k_15_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_15_end_mask_0 = const()[name = string("slot_k_15_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_15_cast_fp16 = slice_by_index(begin = slot_k_15_begin_0, end = slot_k_15_end_0, end_mask = slot_k_15_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("slot_k_15_cast_fp16")];
+            tensor<int32, [4]> slot_v_15_begin_0 = const()[name = string("slot_v_15_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_15_end_0 = const()[name = string("slot_v_15_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_15_end_mask_0 = const()[name = string("slot_v_15_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_15_cast_fp16 = slice_by_index(begin = slot_v_15_begin_0, end = slot_v_15_end_0, end_mask = slot_v_15_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("slot_v_15_cast_fp16")];
+            tensor<int32, [4]> var_5363_begin_0 = const()[name = string("op_5363_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5363_end_0 = const()[name = string("op_5363_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5363_end_mask_0 = const()[name = string("op_5363_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5363_cast_fp16 = slice_by_index(begin = var_5363_begin_0, end = var_5363_end_0, end_mask = var_5363_end_mask_0, x = slot_k_15_cast_fp16)[name = string("op_5363_cast_fp16")];
+            int32 var_5370 = const()[name = string("op_5370"), val = int32(2)];
+            bool new_k_15_interleave_0 = const()[name = string("new_k_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_15_cast_fp16 = concat(axis = var_5370, interleave = new_k_15_interleave_0, values = (var_5363_cast_fp16, k_padded_13_cast_fp16))[name = string("new_k_15_cast_fp16")];
+            tensor<int32, [4]> var_5386_begin_0 = const()[name = string("op_5386_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5386_end_0 = const()[name = string("op_5386_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5386_end_mask_0 = const()[name = string("op_5386_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5386_cast_fp16 = slice_by_index(begin = var_5386_begin_0, end = var_5386_end_0, end_mask = var_5386_end_mask_0, x = slot_v_15_cast_fp16)[name = string("op_5386_cast_fp16")];
+            int32 var_5393 = const()[name = string("op_5393"), val = int32(2)];
+            bool new_v_15_interleave_0 = const()[name = string("new_v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_15_cast_fp16 = concat(axis = var_5393, interleave = new_v_15_interleave_0, values = (var_5386_cast_fp16, v_padded_13_cast_fp16))[name = string("new_v_15_cast_fp16")];
+            tensor<int32, [4]> var_5399_begin_0 = const()[name = string("op_5399_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5399_end_0 = const()[name = string("op_5399_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_5399_end_mask_0 = const()[name = string("op_5399_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_5399_cast_fp16 = slice_by_index(begin = var_5399_begin_0, end = var_5399_end_0, end_mask = var_5399_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("op_5399_cast_fp16")];
+            tensor<int32, [4]> var_5404_begin_0 = const()[name = string("op_5404_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5404_end_0 = const()[name = string("op_5404_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_5404_end_mask_0 = const()[name = string("op_5404_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_5404_cast_fp16 = slice_by_index(begin = var_5404_begin_0, end = var_5404_end_0, end_mask = var_5404_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("op_5404_cast_fp16")];
+            int32 var_5406 = const()[name = string("op_5406"), val = int32(0)];
+            bool K_sliding_out_13_interleave_0 = const()[name = string("K_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_13_cast_fp16 = concat(axis = var_5406, interleave = K_sliding_out_13_interleave_0, values = (var_5399_cast_fp16, new_k_15_cast_fp16, var_5404_cast_fp16))[name = string("K_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_5412_begin_0 = const()[name = string("op_5412_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5412_end_0 = const()[name = string("op_5412_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_5412_end_mask_0 = const()[name = string("op_5412_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_5412_cast_fp16 = slice_by_index(begin = var_5412_begin_0, end = var_5412_end_0, end_mask = var_5412_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("op_5412_cast_fp16")];
+            tensor<int32, [4]> var_5417_begin_0 = const()[name = string("op_5417_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5417_end_0 = const()[name = string("op_5417_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_5417_end_mask_0 = const()[name = string("op_5417_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_5417_cast_fp16 = slice_by_index(begin = var_5417_begin_0, end = var_5417_end_0, end_mask = var_5417_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("op_5417_cast_fp16")];
+            int32 var_5419 = const()[name = string("op_5419"), val = int32(0)];
+            bool V_sliding_out_13_interleave_0 = const()[name = string("V_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_13_cast_fp16 = concat(axis = var_5419, interleave = V_sliding_out_13_interleave_0, values = (var_5412_cast_fp16, new_v_15_cast_fp16, var_5417_cast_fp16))[name = string("V_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_5425_begin_0 = const()[name = string("op_5425_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_5425_end_0 = const()[name = string("op_5425_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5425_end_mask_0 = const()[name = string("op_5425_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_5425_cast_fp16 = slice_by_index(begin = var_5425_begin_0, end = var_5425_end_0, end_mask = var_5425_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5425_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_15_begin_0 = const()[name = string("K_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_15_end_0 = const()[name = string("K_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_15_end_mask_0 = const()[name = string("K_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_15_cast_fp16 = slice_by_index(begin = K_for_attn_15_begin_0, end = K_for_attn_15_end_0, end_mask = K_for_attn_15_end_mask_0, x = var_5425_cast_fp16)[name = string("K_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> var_5435_begin_0 = const()[name = string("op_5435_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_5435_end_0 = const()[name = string("op_5435_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5435_end_mask_0 = const()[name = string("op_5435_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_5435_cast_fp16 = slice_by_index(begin = var_5435_begin_0, end = var_5435_end_0, end_mask = var_5435_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5435_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_15_begin_0 = const()[name = string("V_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_15_end_0 = const()[name = string("V_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_15_end_mask_0 = const()[name = string("V_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_15_cast_fp16 = slice_by_index(begin = V_for_attn_15_begin_0, end = V_for_attn_15_end_0, end_mask = V_for_attn_15_end_mask_0, x = var_5435_cast_fp16)[name = string("V_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> transpose_28_perm_0 = const()[name = string("transpose_28_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_14_reps_0 = const()[name = string("tile_14_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_28_cast_fp16 = transpose(perm = transpose_28_perm_0, x = K_for_attn_15_cast_fp16)[name = string("transpose_93")];
+            tensor<fp16, [8, 1, 512, 256]> tile_14_cast_fp16 = tile(reps = tile_14_reps_0, x = transpose_28_cast_fp16)[name = string("tile_14_cast_fp16")];
+            tensor<int32, [5]> concat_30 = const()[name = string("concat_30"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_28_cast_fp16 = reshape(shape = concat_30, x = tile_14_cast_fp16)[name = string("reshape_28_cast_fp16")];
+            tensor<int32, [5]> transpose_29_perm_0 = const()[name = string("transpose_29_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_31 = const()[name = string("concat_31"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_29_cast_fp16 = transpose(perm = transpose_29_perm_0, x = reshape_28_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_29_cast_fp16 = reshape(shape = concat_31, x = transpose_29_cast_fp16)[name = string("reshape_29_cast_fp16")];
+            tensor<int32, [4]> transpose_71_perm_0 = const()[name = string("transpose_71_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_30_perm_0 = const()[name = string("transpose_30_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_15_reps_0 = const()[name = string("tile_15_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_30_cast_fp16 = transpose(perm = transpose_30_perm_0, x = V_for_attn_15_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [8, 1, 512, 256]> tile_15_cast_fp16 = tile(reps = tile_15_reps_0, x = transpose_30_cast_fp16)[name = string("tile_15_cast_fp16")];
+            tensor<int32, [5]> concat_32 = const()[name = string("concat_32"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_30_cast_fp16 = reshape(shape = concat_32, x = tile_15_cast_fp16)[name = string("reshape_30_cast_fp16")];
+            tensor<int32, [5]> transpose_31_perm_0 = const()[name = string("transpose_31_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_33 = const()[name = string("concat_33"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_31_cast_fp16 = transpose(perm = transpose_31_perm_0, x = reshape_30_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_31_cast_fp16 = reshape(shape = concat_33, x = transpose_31_cast_fp16)[name = string("reshape_31_cast_fp16")];
+            tensor<int32, [4]> V_expanded_15_perm_0 = const()[name = string("V_expanded_15_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_71_cast_fp16 = transpose(perm = transpose_71_perm_0, x = reshape_29_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_95_cast_fp16, y = transpose_71_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_147_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_147_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_147_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 3, 512]> var_5470 = sub(x = x_147_cast_fp16, y = reduce_max_7)[name = string("op_5470")];
+            tensor<fp16, [1, 8, 3, 512]> var_5476 = exp(x = var_5470)[name = string("op_5476")];
+            tensor<int32, [1]> var_5486_axes_0 = const()[name = string("op_5486_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5486_keep_dims_0 = const()[name = string("op_5486_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_5486 = reduce_sum(axes = var_5486_axes_0, keep_dims = var_5486_keep_dims_0, x = var_5476)[name = string("op_5486")];
+            tensor<fp16, [1, 8, 3, 512]> var_5492_cast_fp16 = real_div(x = var_5476, y = var_5486)[name = string("op_5492_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_15_cast_fp16 = transpose(perm = V_expanded_15_perm_0, x = reshape_31_cast_fp16)[name = string("transpose_88")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_5492_cast_fp16, y = V_expanded_15_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_5503 = const()[name = string("op_5503"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5510 = const()[name = string("op_5510"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_5504_cast_fp16 = transpose(perm = var_5503, x = attn_output_43_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 3, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_5510, x = var_5504_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_5515 = const()[name = string("op_5515"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5531_pad_type_0 = const()[name = string("op_5531_pad_type_0"), val = string("valid")];
+            int32 var_5531_groups_0 = const()[name = string("op_5531_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5531_strides_0 = const()[name = string("op_5531_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5531_pad_0 = const()[name = string("op_5531_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5531_dilations_0 = const()[name = string("op_5531_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(568478016))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571099520))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_5516_cast_fp16 = transpose(perm = var_5515, x = attn_output_45_cast_fp16)[name = string("transpose_86")];
+            tensor<fp16, [1, 2560, 3]> var_5531_cast_fp16 = conv(dilations = var_5531_dilations_0, groups = var_5531_groups_0, pad = var_5531_pad_0, pad_type = var_5531_pad_type_0, strides = var_5531_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_5516_cast_fp16)[name = string("op_5531_cast_fp16")];
+            tensor<int32, [3]> var_5535 = const()[name = string("op_5535"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5541 = const()[name = string("op_5541"), val = int32(-1)];
+            fp16 const_90_promoted_to_fp16 = const()[name = string("const_90_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_151_cast_fp16 = transpose(perm = var_5535, x = var_5531_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 3, 2560]> var_5543_cast_fp16 = mul(x = x_151_cast_fp16, y = const_90_promoted_to_fp16)[name = string("op_5543_cast_fp16")];
+            bool input_225_interleave_0 = const()[name = string("input_225_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_225_cast_fp16 = concat(axis = var_5541, interleave = input_225_interleave_0, values = (x_151_cast_fp16, var_5543_cast_fp16))[name = string("input_225_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5538_to_fp16 = const()[name = string("op_5538_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_5538_to_fp16, x = input_225_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_5548_split_sizes_0 = const()[name = string("op_5548_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5548_axis_0 = const()[name = string("op_5548_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5548_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5548_cast_fp16_1 = split(axis = var_5548_axis_0, split_sizes = var_5548_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_5548_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571102144)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_47_cast_fp16 = mul(x = var_5548_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_153_cast_fp16 = add(x = x_139_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_153_cast_fp16")];
+            int32 var_5557 = const()[name = string("op_5557"), val = int32(-1)];
+            fp16 const_91_promoted_to_fp16 = const()[name = string("const_91_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_5559_cast_fp16 = mul(x = x_153_cast_fp16, y = const_91_promoted_to_fp16)[name = string("op_5559_cast_fp16")];
+            bool input_227_interleave_0 = const()[name = string("input_227_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_227_cast_fp16 = concat(axis = var_5557, interleave = input_227_interleave_0, values = (x_153_cast_fp16, var_5559_cast_fp16))[name = string("input_227_cast_fp16")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5554_to_fp16 = const()[name = string("op_5554_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_5554_to_fp16, x = input_227_cast_fp16)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_5564_split_sizes_0 = const()[name = string("op_5564_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5564_axis_0 = const()[name = string("op_5564_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5564_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5564_cast_fp16_1 = split(axis = var_5564_axis_0, split_sizes = var_5564_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_5564_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571107328)))];
+            tensor<fp16, [1, 3, 2560]> h_45_cast_fp16 = mul(x = var_5564_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_5575 = const()[name = string("op_5575"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_229_axes_0 = const()[name = string("input_229_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5576 = transpose(perm = var_5575, x = h_45_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 2560, 1, 3]> input_229 = expand_dims(axes = input_229_axes_0, x = var_5576)[name = string("input_229")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_229)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_229)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 3]> input_231 = mul(x = gate_31, y = up_15)[name = string("input_231")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_231)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_5616_axes_0 = const()[name = string("op_5616_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5616 = squeeze(axes = var_5616_axes_0, x = mlp_out_15)[name = string("op_5616")];
+            tensor<int32, [3]> var_5620 = const()[name = string("op_5620"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5626 = const()[name = string("op_5626"), val = int32(-1)];
+            fp16 const_92_promoted = const()[name = string("const_92_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_155 = transpose(perm = var_5620, x = var_5616)[name = string("transpose_83")];
+            tensor<fp16, [1, 3, 2560]> var_5628 = mul(x = x_155, y = const_92_promoted)[name = string("op_5628")];
+            bool input_233_interleave_0 = const()[name = string("input_233_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_233 = concat(axis = var_5626, interleave = input_233_interleave_0, values = (x_155, var_5628))[name = string("input_233")];
+            tensor<int32, [1]> normed_221_axes_0 = const()[name = string("normed_221_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5623_to_fp16 = const()[name = string("op_5623_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_221_cast_fp16 = layer_norm(axes = normed_221_axes_0, epsilon = var_5623_to_fp16, x = input_233)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [2]> var_5633_split_sizes_0 = const()[name = string("op_5633_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5633_axis_0 = const()[name = string("op_5633_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5633_0, tensor<fp16, [1, 3, 2560]> var_5633_1 = split(axis = var_5633_axis_0, split_sizes = var_5633_split_sizes_0, x = normed_221_cast_fp16)[name = string("op_5633")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_73 = mul(x = var_5633_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_75_cast_fp16 = add(x = x_153_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 1792])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 3, 2048])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_5661 = const()[name = string("op_5661"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_235_axes_0 = const()[name = string("input_235_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5662 = transpose(perm = var_5661, x = hidden_states_75_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1, 3]> input_235 = expand_dims(axes = input_235_axes_0, x = var_5662)[name = string("input_235")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_235)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_5681 = const()[name = string("op_5681"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_5682_cast_fp16 = transpose(perm = var_5681, x = per_layer_slice_15_cast_fp16)[name = string("transpose_81")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_5682_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_237_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_237_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571112512))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571440256))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_237_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_5698_axes_0 = const()[name = string("op_5698_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5698_cast_fp16 = squeeze(axes = var_5698_axes_0, x = gated_47_cast_fp16)[name = string("op_5698_cast_fp16")];
+            tensor<int32, [3]> var_5702 = const()[name = string("op_5702"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5708 = const()[name = string("op_5708"), val = int32(-1)];
+            fp16 const_93_promoted_to_fp16 = const()[name = string("const_93_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_157_cast_fp16 = transpose(perm = var_5702, x = var_5698_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 3, 2560]> var_5710_cast_fp16 = mul(x = x_157_cast_fp16, y = const_93_promoted_to_fp16)[name = string("op_5710_cast_fp16")];
+            bool input_239_interleave_0 = const()[name = string("input_239_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_239_cast_fp16 = concat(axis = var_5708, interleave = input_239_interleave_0, values = (x_157_cast_fp16, var_5710_cast_fp16))[name = string("input_239_cast_fp16")];
+            tensor<int32, [1]> normed_225_axes_0 = const()[name = string("normed_225_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5705_to_fp16 = const()[name = string("op_5705_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_225_cast_fp16 = layer_norm(axes = normed_225_axes_0, epsilon = var_5705_to_fp16, x = input_239_cast_fp16)[name = string("normed_225_cast_fp16")];
+            tensor<int32, [2]> var_5715_split_sizes_0 = const()[name = string("op_5715_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5715_axis_0 = const()[name = string("op_5715_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5715_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5715_cast_fp16_1 = split(axis = var_5715_axis_0, split_sizes = var_5715_split_sizes_0, x = normed_225_cast_fp16)[name = string("op_5715_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571442880)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_79_cast_fp16 = mul(x = var_5715_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_94_promoted_to_fp16 = const()[name = string("const_94_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.58p-1])];
+            tensor<fp16, [1, 3, 2560]> x_159_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_94_promoted_to_fp16)[name = string("x_159_cast_fp16")];
+            int32 var_5730 = const()[name = string("op_5730"), val = int32(-1)];
+            fp16 const_95_promoted_to_fp16 = const()[name = string("const_95_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_5732_cast_fp16 = mul(x = x_159_cast_fp16, y = const_95_promoted_to_fp16)[name = string("op_5732_cast_fp16")];
+            bool input_241_interleave_0 = const()[name = string("input_241_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_241_cast_fp16 = concat(axis = var_5730, interleave = input_241_interleave_0, values = (x_159_cast_fp16, var_5732_cast_fp16))[name = string("input_241_cast_fp16")];
+            tensor<int32, [1]> normed_229_axes_0 = const()[name = string("normed_229_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5727_to_fp16 = const()[name = string("op_5727_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_229_cast_fp16 = layer_norm(axes = normed_229_axes_0, epsilon = var_5727_to_fp16, x = input_241_cast_fp16)[name = string("normed_229_cast_fp16")];
+            tensor<int32, [2]> var_5737_split_sizes_0 = const()[name = string("op_5737_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5737_axis_0 = const()[name = string("op_5737_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5737_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5737_cast_fp16_1 = split(axis = var_5737_axis_0, split_sizes = var_5737_split_sizes_0, x = normed_229_cast_fp16)[name = string("op_5737_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571448064)))];
+            tensor<fp16, [1, 3, 2560]> h_49_cast_fp16 = mul(x = var_5737_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_5743 = const()[name = string("op_5743"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5746_axes_0 = const()[name = string("op_5746_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5744_cast_fp16 = transpose(perm = var_5743, x = h_49_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 2560, 1, 3]> var_5746_cast_fp16 = expand_dims(axes = var_5746_axes_0, x = var_5744_cast_fp16)[name = string("op_5746_cast_fp16")];
+            string q_97_pad_type_0 = const()[name = string("q_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_97_strides_0 = const()[name = string("q_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_97_pad_0 = const()[name = string("q_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_97_dilations_0 = const()[name = string("q_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_97_groups_0 = const()[name = string("q_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_97 = conv(dilations = q_97_dilations_0, groups = q_97_groups_0, pad = q_97_pad_0, pad_type = q_97_pad_type_0, strides = q_97_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_5746_cast_fp16)[name = string("q_97")];
+            tensor<int32, [4]> var_5767 = const()[name = string("op_5767"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_5768 = reshape(shape = var_5767, x = q_97)[name = string("op_5768")];
+            tensor<int32, [4]> transpose_72_perm_0 = const()[name = string("transpose_72_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_5791 = const()[name = string("op_5791"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_72 = transpose(perm = transpose_72_perm_0, x = var_5768)[name = string("transpose_78")];
+            tensor<fp16, [3, 8, 256]> x_161 = reshape(shape = var_5791, x = transpose_72)[name = string("x_161")];
+            int32 var_5797 = const()[name = string("op_5797"), val = int32(-1)];
+            fp16 const_96_promoted = const()[name = string("const_96_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_5799 = mul(x = x_161, y = const_96_promoted)[name = string("op_5799")];
+            bool input_245_interleave_0 = const()[name = string("input_245_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_245 = concat(axis = var_5797, interleave = input_245_interleave_0, values = (x_161, var_5799))[name = string("input_245")];
+            tensor<int32, [1]> normed_233_axes_0 = const()[name = string("normed_233_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5794_to_fp16 = const()[name = string("op_5794_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_233_cast_fp16 = layer_norm(axes = normed_233_axes_0, epsilon = var_5794_to_fp16, x = input_245)[name = string("normed_233_cast_fp16")];
+            tensor<int32, [2]> var_5804_split_sizes_0 = const()[name = string("op_5804_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5804_axis_0 = const()[name = string("op_5804_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_5804_0, tensor<fp16, [3, 8, 256]> var_5804_1 = split(axis = var_5804_axis_0, split_sizes = var_5804_split_sizes_0, x = normed_233_cast_fp16)[name = string("op_5804")];
+            tensor<fp16, [3, 8, 256]> q_101 = mul(x = var_5804_0, y = layers_8_self_attn_q_norm_weight)[name = string("q_101")];
+            tensor<int32, [4]> var_5811 = const()[name = string("op_5811"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_5812 = reshape(shape = var_5811, x = q_101)[name = string("op_5812")];
+            tensor<int32, [4]> var_5817 = const()[name = string("op_5817"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_103 = transpose(perm = var_5817, x = var_5812)[name = string("transpose_77")];
+            tensor<fp16, [1, 8, 3, 256]> var_5819_cast_fp16 = mul(x = q_103, y = cos_s)[name = string("op_5819_cast_fp16")];
+            tensor<int32, [2]> var_5820_split_sizes_0 = const()[name = string("op_5820_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5820_axis_0 = const()[name = string("op_5820_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_5820_0, tensor<fp16, [1, 8, 3, 128]> var_5820_1 = split(axis = var_5820_axis_0, split_sizes = var_5820_split_sizes_0, x = q_103)[name = string("op_5820")];
+            fp16 const_97_promoted = const()[name = string("const_97_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_5822 = mul(x = var_5820_1, y = const_97_promoted)[name = string("op_5822")];
+            int32 var_5824 = const()[name = string("op_5824"), val = int32(-1)];
+            bool var_5825_interleave_0 = const()[name = string("op_5825_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_5825 = concat(axis = var_5824, interleave = var_5825_interleave_0, values = (var_5822, var_5820_0))[name = string("op_5825")];
+            tensor<fp16, [1, 8, 3, 256]> var_5826_cast_fp16 = mul(x = var_5825, y = sin_s)[name = string("op_5826_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_107_cast_fp16 = add(x = var_5819_cast_fp16, y = var_5826_cast_fp16)[name = string("q_107_cast_fp16")];
+            string k_51_pad_type_0 = const()[name = string("k_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_51_strides_0 = const()[name = string("k_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_51_pad_0 = const()[name = string("k_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_51_dilations_0 = const()[name = string("k_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_51_groups_0 = const()[name = string("k_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_51 = conv(dilations = k_51_dilations_0, groups = k_51_groups_0, pad = k_51_pad_0, pad_type = k_51_pad_type_0, strides = k_51_strides_0, weight = layers_8_self_attn_k_proj_weight_palettized, x = var_5746_cast_fp16)[name = string("k_51")];
+            tensor<int32, [4]> var_5844 = const()[name = string("op_5844"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5845 = reshape(shape = var_5844, x = k_51)[name = string("op_5845")];
+            tensor<int32, [4]> transpose_73_perm_0 = const()[name = string("transpose_73_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_19_pad_type_0 = const()[name = string("v_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_19_strides_0 = const()[name = string("v_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_19_pad_0 = const()[name = string("v_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_19_dilations_0 = const()[name = string("v_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_19_groups_0 = const()[name = string("v_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_19 = conv(dilations = v_19_dilations_0, groups = v_19_groups_0, pad = v_19_pad_0, pad_type = v_19_pad_type_0, strides = v_19_strides_0, weight = layers_8_self_attn_v_proj_weight_palettized, x = var_5746_cast_fp16)[name = string("v_19")];
+            tensor<int32, [4]> var_5872 = const()[name = string("op_5872"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5873 = reshape(shape = var_5872, x = v_19)[name = string("op_5873")];
+            tensor<int32, [4]> var_5878 = const()[name = string("op_5878"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5896 = const()[name = string("op_5896"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_73 = transpose(perm = transpose_73_perm_0, x = var_5845)[name = string("transpose_76")];
+            tensor<fp16, [3, 2, 256]> x_163 = reshape(shape = var_5896, x = transpose_73)[name = string("x_163")];
+            int32 var_5902 = const()[name = string("op_5902"), val = int32(-1)];
+            fp16 const_98_promoted = const()[name = string("const_98_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_5904 = mul(x = x_163, y = const_98_promoted)[name = string("op_5904")];
+            bool input_247_interleave_0 = const()[name = string("input_247_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_247 = concat(axis = var_5902, interleave = input_247_interleave_0, values = (x_163, var_5904))[name = string("input_247")];
+            tensor<int32, [1]> normed_237_axes_0 = const()[name = string("normed_237_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5899_to_fp16 = const()[name = string("op_5899_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_237_cast_fp16 = layer_norm(axes = normed_237_axes_0, epsilon = var_5899_to_fp16, x = input_247)[name = string("normed_237_cast_fp16")];
+            tensor<int32, [2]> var_5909_split_sizes_0 = const()[name = string("op_5909_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5909_axis_0 = const()[name = string("op_5909_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_5909_0, tensor<fp16, [3, 2, 256]> var_5909_1 = split(axis = var_5909_axis_0, split_sizes = var_5909_split_sizes_0, x = normed_237_cast_fp16)[name = string("op_5909")];
+            tensor<fp16, [3, 2, 256]> k_55 = mul(x = var_5909_0, y = layers_8_self_attn_k_norm_weight)[name = string("k_55")];
+            tensor<int32, [4]> var_5916 = const()[name = string("op_5916"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_5917 = reshape(shape = var_5916, x = k_55)[name = string("op_5917")];
+            tensor<int32, [4]> var_5922 = const()[name = string("op_5922"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_5924_promoted = const()[name = string("op_5924_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_5879 = transpose(perm = var_5878, x = var_5873)[name = string("transpose_75")];
+            tensor<fp16, [1, 2, 3, 256]> var_5925 = pow(x = var_5879, y = var_5924_promoted)[name = string("op_5925")];
+            tensor<int32, [1]> var_5930_axes_0 = const()[name = string("op_5930_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5930_keep_dims_0 = const()[name = string("op_5930_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_5930 = reduce_mean(axes = var_5930_axes_0, keep_dims = var_5930_keep_dims_0, x = var_5925)[name = string("op_5930")];
+            fp16 var_5932_to_fp16 = const()[name = string("op_5932_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_17_cast_fp16 = add(x = var_5930, y = var_5932_to_fp16)[name = string("mean_sq_17_cast_fp16")];
+            fp32 var_5934_epsilon_0 = const()[name = string("op_5934_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_5934_cast_fp16 = rsqrt(epsilon = var_5934_epsilon_0, x = mean_sq_17_cast_fp16)[name = string("op_5934_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_251_cast_fp16 = mul(x = var_5879, y = var_5934_cast_fp16)[name = string("input_251_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_105 = transpose(perm = var_5922, x = var_5917)[name = string("transpose_74")];
+            tensor<fp16, [1, 2, 3, 256]> var_5936_cast_fp16 = mul(x = q_105, y = cos_s)[name = string("op_5936_cast_fp16")];
+            tensor<int32, [2]> var_5937_split_sizes_0 = const()[name = string("op_5937_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5937_axis_0 = const()[name = string("op_5937_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_5937_0, tensor<fp16, [1, 2, 3, 128]> var_5937_1 = split(axis = var_5937_axis_0, split_sizes = var_5937_split_sizes_0, x = q_105)[name = string("op_5937")];
+            fp16 const_99_promoted = const()[name = string("const_99_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_5939 = mul(x = var_5937_1, y = const_99_promoted)[name = string("op_5939")];
+            int32 var_5941 = const()[name = string("op_5941"), val = int32(-1)];
+            bool var_5942_interleave_0 = const()[name = string("op_5942_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_5942 = concat(axis = var_5941, interleave = var_5942_interleave_0, values = (var_5939, var_5937_0))[name = string("op_5942")];
+            tensor<fp16, [1, 2, 3, 256]> var_5943_cast_fp16 = mul(x = var_5942, y = sin_s)[name = string("op_5943_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_249_cast_fp16 = add(x = var_5936_cast_fp16, y = var_5943_cast_fp16)[name = string("input_249_cast_fp16")];
+            tensor<int32, [8]> k_padded_15_pad_0 = const()[name = string("k_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_15_mode_0 = const()[name = string("k_padded_15_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_15_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = k_padded_15_mode_0, pad = k_padded_15_pad_0, x = input_249_cast_fp16)[name = string("k_padded_15_cast_fp16")];
+            tensor<int32, [8]> v_padded_15_pad_0 = const()[name = string("v_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_15_mode_0 = const()[name = string("v_padded_15_mode_0"), val = string("constant")];
+            fp16 const_101_to_fp16 = const()[name = string("const_101_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_15_cast_fp16 = pad(constant_val = const_101_to_fp16, mode = v_padded_15_mode_0, pad = v_padded_15_pad_0, x = input_251_cast_fp16)[name = string("v_padded_15_cast_fp16")];
+            tensor<int32, [4]> slot_k_17_begin_0 = const()[name = string("slot_k_17_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_17_end_0 = const()[name = string("slot_k_17_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_17_end_mask_0 = const()[name = string("slot_k_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_17_cast_fp16 = slice_by_index(begin = slot_k_17_begin_0, end = slot_k_17_end_0, end_mask = slot_k_17_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("slot_k_17_cast_fp16")];
+            tensor<int32, [4]> slot_v_17_begin_0 = const()[name = string("slot_v_17_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_17_end_0 = const()[name = string("slot_v_17_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_17_end_mask_0 = const()[name = string("slot_v_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_17_cast_fp16 = slice_by_index(begin = slot_v_17_begin_0, end = slot_v_17_end_0, end_mask = slot_v_17_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("slot_v_17_cast_fp16")];
+            tensor<int32, [4]> var_5982_begin_0 = const()[name = string("op_5982_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5982_end_0 = const()[name = string("op_5982_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5982_end_mask_0 = const()[name = string("op_5982_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5982_cast_fp16 = slice_by_index(begin = var_5982_begin_0, end = var_5982_end_0, end_mask = var_5982_end_mask_0, x = slot_k_17_cast_fp16)[name = string("op_5982_cast_fp16")];
+            int32 var_5989 = const()[name = string("op_5989"), val = int32(2)];
+            bool new_k_17_interleave_0 = const()[name = string("new_k_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_17_cast_fp16 = concat(axis = var_5989, interleave = new_k_17_interleave_0, values = (var_5982_cast_fp16, k_padded_15_cast_fp16))[name = string("new_k_17_cast_fp16")];
+            tensor<int32, [4]> var_6005_begin_0 = const()[name = string("op_6005_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_6005_end_0 = const()[name = string("op_6005_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6005_end_mask_0 = const()[name = string("op_6005_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_6005_cast_fp16 = slice_by_index(begin = var_6005_begin_0, end = var_6005_end_0, end_mask = var_6005_end_mask_0, x = slot_v_17_cast_fp16)[name = string("op_6005_cast_fp16")];
+            int32 var_6012 = const()[name = string("op_6012"), val = int32(2)];
+            bool new_v_17_interleave_0 = const()[name = string("new_v_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_17_cast_fp16 = concat(axis = var_6012, interleave = new_v_17_interleave_0, values = (var_6005_cast_fp16, v_padded_15_cast_fp16))[name = string("new_v_17_cast_fp16")];
+            tensor<int32, [4]> var_6018_begin_0 = const()[name = string("op_6018_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6018_end_0 = const()[name = string("op_6018_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_6018_end_mask_0 = const()[name = string("op_6018_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_6018_cast_fp16 = slice_by_index(begin = var_6018_begin_0, end = var_6018_end_0, end_mask = var_6018_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("op_6018_cast_fp16")];
+            tensor<int32, [4]> var_6023_begin_0 = const()[name = string("op_6023_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6023_end_0 = const()[name = string("op_6023_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6023_end_mask_0 = const()[name = string("op_6023_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_6023_cast_fp16 = slice_by_index(begin = var_6023_begin_0, end = var_6023_end_0, end_mask = var_6023_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("op_6023_cast_fp16")];
+            int32 var_6025 = const()[name = string("op_6025"), val = int32(0)];
+            bool K_sliding_out_15_interleave_0 = const()[name = string("K_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_15_cast_fp16 = concat(axis = var_6025, interleave = K_sliding_out_15_interleave_0, values = (var_6018_cast_fp16, new_k_17_cast_fp16, var_6023_cast_fp16))[name = string("K_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_6031_begin_0 = const()[name = string("op_6031_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6031_end_0 = const()[name = string("op_6031_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_6031_end_mask_0 = const()[name = string("op_6031_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_6031_cast_fp16 = slice_by_index(begin = var_6031_begin_0, end = var_6031_end_0, end_mask = var_6031_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("op_6031_cast_fp16")];
+            tensor<int32, [4]> var_6036_begin_0 = const()[name = string("op_6036_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6036_end_0 = const()[name = string("op_6036_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6036_end_mask_0 = const()[name = string("op_6036_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_6036_cast_fp16 = slice_by_index(begin = var_6036_begin_0, end = var_6036_end_0, end_mask = var_6036_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("op_6036_cast_fp16")];
+            int32 var_6038 = const()[name = string("op_6038"), val = int32(0)];
+            bool V_sliding_out_15_interleave_0 = const()[name = string("V_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_15_cast_fp16 = concat(axis = var_6038, interleave = V_sliding_out_15_interleave_0, values = (var_6031_cast_fp16, new_v_17_cast_fp16, var_6036_cast_fp16))[name = string("V_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_6044_begin_0 = const()[name = string("op_6044_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_6044_end_0 = const()[name = string("op_6044_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_6044_end_mask_0 = const()[name = string("op_6044_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6044_cast_fp16 = slice_by_index(begin = var_6044_begin_0, end = var_6044_end_0, end_mask = var_6044_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("op_6044_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_17_begin_0 = const()[name = string("K_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_17_end_0 = const()[name = string("K_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_17_end_mask_0 = const()[name = string("K_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_17_cast_fp16 = slice_by_index(begin = K_for_attn_17_begin_0, end = K_for_attn_17_end_0, end_mask = K_for_attn_17_end_mask_0, x = var_6044_cast_fp16)[name = string("K_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> var_6054_begin_0 = const()[name = string("op_6054_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_6054_end_0 = const()[name = string("op_6054_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_6054_end_mask_0 = const()[name = string("op_6054_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6054_cast_fp16 = slice_by_index(begin = var_6054_begin_0, end = var_6054_end_0, end_mask = var_6054_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("op_6054_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_17_begin_0 = const()[name = string("V_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_17_end_0 = const()[name = string("V_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_17_end_mask_0 = const()[name = string("V_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_17_cast_fp16 = slice_by_index(begin = V_for_attn_17_begin_0, end = V_for_attn_17_end_0, end_mask = V_for_attn_17_end_mask_0, x = var_6054_cast_fp16)[name = string("V_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> transpose_32_perm_0 = const()[name = string("transpose_32_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_16_reps_0 = const()[name = string("tile_16_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_32_cast_fp16 = transpose(perm = transpose_32_perm_0, x = K_for_attn_17_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [8, 1, 512, 256]> tile_16_cast_fp16 = tile(reps = tile_16_reps_0, x = transpose_32_cast_fp16)[name = string("tile_16_cast_fp16")];
+            tensor<int32, [5]> concat_34 = const()[name = string("concat_34"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_32_cast_fp16 = reshape(shape = concat_34, x = tile_16_cast_fp16)[name = string("reshape_32_cast_fp16")];
+            tensor<int32, [5]> transpose_33_perm_0 = const()[name = string("transpose_33_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_35 = const()[name = string("concat_35"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_33_cast_fp16 = transpose(perm = transpose_33_perm_0, x = reshape_32_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_33_cast_fp16 = reshape(shape = concat_35, x = transpose_33_cast_fp16)[name = string("reshape_33_cast_fp16")];
+            tensor<int32, [4]> transpose_74_perm_0 = const()[name = string("transpose_74_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_34_perm_0 = const()[name = string("transpose_34_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_17_reps_0 = const()[name = string("tile_17_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_34_cast_fp16 = transpose(perm = transpose_34_perm_0, x = V_for_attn_17_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [8, 1, 512, 256]> tile_17_cast_fp16 = tile(reps = tile_17_reps_0, x = transpose_34_cast_fp16)[name = string("tile_17_cast_fp16")];
+            tensor<int32, [5]> concat_36 = const()[name = string("concat_36"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_34_cast_fp16 = reshape(shape = concat_36, x = tile_17_cast_fp16)[name = string("reshape_34_cast_fp16")];
+            tensor<int32, [5]> transpose_35_perm_0 = const()[name = string("transpose_35_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_37 = const()[name = string("concat_37"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_35_cast_fp16 = transpose(perm = transpose_35_perm_0, x = reshape_34_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_35_cast_fp16 = reshape(shape = concat_37, x = transpose_35_cast_fp16)[name = string("reshape_35_cast_fp16")];
+            tensor<int32, [4]> V_expanded_17_perm_0 = const()[name = string("V_expanded_17_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_74_cast_fp16 = transpose(perm = transpose_74_perm_0, x = reshape_33_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_107_cast_fp16, y = transpose_74_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_167_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_167_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_167_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 3, 512]> var_6089 = sub(x = x_167_cast_fp16, y = reduce_max_8)[name = string("op_6089")];
+            tensor<fp16, [1, 8, 3, 512]> var_6095 = exp(x = var_6089)[name = string("op_6095")];
+            tensor<int32, [1]> var_6105_axes_0 = const()[name = string("op_6105_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6105_keep_dims_0 = const()[name = string("op_6105_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_6105 = reduce_sum(axes = var_6105_axes_0, keep_dims = var_6105_keep_dims_0, x = var_6095)[name = string("op_6105")];
+            tensor<fp16, [1, 8, 3, 512]> var_6111_cast_fp16 = real_div(x = var_6095, y = var_6105)[name = string("op_6111_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_17_cast_fp16 = transpose(perm = V_expanded_17_perm_0, x = reshape_35_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_6111_cast_fp16, y = V_expanded_17_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_6122 = const()[name = string("op_6122"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6129 = const()[name = string("op_6129"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_6123_cast_fp16 = transpose(perm = var_6122, x = attn_output_49_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 3, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_6129, x = var_6123_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_6134 = const()[name = string("op_6134"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6150_pad_type_0 = const()[name = string("op_6150_pad_type_0"), val = string("valid")];
+            int32 var_6150_groups_0 = const()[name = string("op_6150_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6150_strides_0 = const()[name = string("op_6150_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6150_pad_0 = const()[name = string("op_6150_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6150_dilations_0 = const()[name = string("op_6150_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571453248))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574074752))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_6135_cast_fp16 = transpose(perm = var_6134, x = attn_output_51_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 2560, 3]> var_6150_cast_fp16 = conv(dilations = var_6150_dilations_0, groups = var_6150_groups_0, pad = var_6150_pad_0, pad_type = var_6150_pad_type_0, strides = var_6150_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_6135_cast_fp16)[name = string("op_6150_cast_fp16")];
+            tensor<int32, [3]> var_6154 = const()[name = string("op_6154"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6160 = const()[name = string("op_6160"), val = int32(-1)];
+            fp16 const_102_promoted_to_fp16 = const()[name = string("const_102_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_171_cast_fp16 = transpose(perm = var_6154, x = var_6150_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [1, 3, 2560]> var_6162_cast_fp16 = mul(x = x_171_cast_fp16, y = const_102_promoted_to_fp16)[name = string("op_6162_cast_fp16")];
+            bool input_255_interleave_0 = const()[name = string("input_255_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_255_cast_fp16 = concat(axis = var_6160, interleave = input_255_interleave_0, values = (x_171_cast_fp16, var_6162_cast_fp16))[name = string("input_255_cast_fp16")];
+            tensor<int32, [1]> normed_241_axes_0 = const()[name = string("normed_241_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6157_to_fp16 = const()[name = string("op_6157_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_241_cast_fp16 = layer_norm(axes = normed_241_axes_0, epsilon = var_6157_to_fp16, x = input_255_cast_fp16)[name = string("normed_241_cast_fp16")];
+            tensor<int32, [2]> var_6167_split_sizes_0 = const()[name = string("op_6167_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6167_axis_0 = const()[name = string("op_6167_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6167_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6167_cast_fp16_1 = split(axis = var_6167_axis_0, split_sizes = var_6167_split_sizes_0, x = normed_241_cast_fp16)[name = string("op_6167_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574077376)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_53_cast_fp16 = mul(x = var_6167_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_53_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_173_cast_fp16 = add(x = x_159_cast_fp16, y = attn_output_53_cast_fp16)[name = string("x_173_cast_fp16")];
+            int32 var_6176 = const()[name = string("op_6176"), val = int32(-1)];
+            fp16 const_103_promoted_to_fp16 = const()[name = string("const_103_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6178_cast_fp16 = mul(x = x_173_cast_fp16, y = const_103_promoted_to_fp16)[name = string("op_6178_cast_fp16")];
+            bool input_257_interleave_0 = const()[name = string("input_257_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_257_cast_fp16 = concat(axis = var_6176, interleave = input_257_interleave_0, values = (x_173_cast_fp16, var_6178_cast_fp16))[name = string("input_257_cast_fp16")];
+            tensor<int32, [1]> normed_245_axes_0 = const()[name = string("normed_245_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6173_to_fp16 = const()[name = string("op_6173_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_245_cast_fp16 = layer_norm(axes = normed_245_axes_0, epsilon = var_6173_to_fp16, x = input_257_cast_fp16)[name = string("normed_245_cast_fp16")];
+            tensor<int32, [2]> var_6183_split_sizes_0 = const()[name = string("op_6183_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6183_axis_0 = const()[name = string("op_6183_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6183_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6183_cast_fp16_1 = split(axis = var_6183_axis_0, split_sizes = var_6183_split_sizes_0, x = normed_245_cast_fp16)[name = string("op_6183_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574082560)))];
+            tensor<fp16, [1, 3, 2560]> h_51_cast_fp16 = mul(x = var_6183_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_6194 = const()[name = string("op_6194"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_259_axes_0 = const()[name = string("input_259_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6195 = transpose(perm = var_6194, x = h_51_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 2560, 1, 3]> input_259 = expand_dims(axes = input_259_axes_0, x = var_6195)[name = string("input_259")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_259)[name = string("gate_33")];
+            string up_17_pad_type_0 = const()[name = string("up_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_17_strides_0 = const()[name = string("up_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_17_pad_0 = const()[name = string("up_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_17_dilations_0 = const()[name = string("up_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_17_groups_0 = const()[name = string("up_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_17 = conv(dilations = up_17_dilations_0, groups = up_17_groups_0, pad = up_17_pad_0, pad_type = up_17_pad_type_0, strides = up_17_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_259)[name = string("up_17")];
+            string gate_35_mode_0 = const()[name = string("gate_35_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_35 = gelu(mode = gate_35_mode_0, x = gate_33)[name = string("gate_35")];
+            tensor<fp16, [1, 10240, 1, 3]> input_261 = mul(x = gate_35, y = up_17)[name = string("input_261")];
+            string mlp_out_17_pad_type_0 = const()[name = string("mlp_out_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_17_strides_0 = const()[name = string("mlp_out_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_17_pad_0 = const()[name = string("mlp_out_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_17_dilations_0 = const()[name = string("mlp_out_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_17_groups_0 = const()[name = string("mlp_out_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_17 = conv(dilations = mlp_out_17_dilations_0, groups = mlp_out_17_groups_0, pad = mlp_out_17_pad_0, pad_type = mlp_out_17_pad_type_0, strides = mlp_out_17_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_261)[name = string("mlp_out_17")];
+            tensor<int32, [1]> var_6235_axes_0 = const()[name = string("op_6235_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6235 = squeeze(axes = var_6235_axes_0, x = mlp_out_17)[name = string("op_6235")];
+            tensor<int32, [3]> var_6239 = const()[name = string("op_6239"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6245 = const()[name = string("op_6245"), val = int32(-1)];
+            fp16 const_104_promoted = const()[name = string("const_104_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_175 = transpose(perm = var_6239, x = var_6235)[name = string("transpose_63")];
+            tensor<fp16, [1, 3, 2560]> var_6247 = mul(x = x_175, y = const_104_promoted)[name = string("op_6247")];
+            bool input_263_interleave_0 = const()[name = string("input_263_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_263 = concat(axis = var_6245, interleave = input_263_interleave_0, values = (x_175, var_6247))[name = string("input_263")];
+            tensor<int32, [1]> normed_249_axes_0 = const()[name = string("normed_249_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6242_to_fp16 = const()[name = string("op_6242_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_249_cast_fp16 = layer_norm(axes = normed_249_axes_0, epsilon = var_6242_to_fp16, x = input_263)[name = string("normed_249_cast_fp16")];
+            tensor<int32, [2]> var_6252_split_sizes_0 = const()[name = string("op_6252_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6252_axis_0 = const()[name = string("op_6252_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6252_0, tensor<fp16, [1, 3, 2560]> var_6252_1 = split(axis = var_6252_axis_0, split_sizes = var_6252_split_sizes_0, x = normed_249_cast_fp16)[name = string("op_6252")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_83 = mul(x = var_6252_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_85_cast_fp16 = add(x = x_173_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_17_begin_0 = const()[name = string("per_layer_slice_17_begin_0"), val = tensor<int32, [3]>([0, 0, 2048])];
+            tensor<int32, [3]> per_layer_slice_17_end_0 = const()[name = string("per_layer_slice_17_end_0"), val = tensor<int32, [3]>([1, 3, 2304])];
+            tensor<bool, [3]> per_layer_slice_17_end_mask_0 = const()[name = string("per_layer_slice_17_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_17_cast_fp16 = slice_by_index(begin = per_layer_slice_17_begin_0, end = per_layer_slice_17_end_0, end_mask = per_layer_slice_17_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_17_cast_fp16")];
+            tensor<int32, [3]> var_6280 = const()[name = string("op_6280"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_265_axes_0 = const()[name = string("input_265_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6281 = transpose(perm = var_6280, x = hidden_states_85_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 2560, 1, 3]> input_265 = expand_dims(axes = input_265_axes_0, x = var_6281)[name = string("input_265")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_265)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_6300 = const()[name = string("op_6300"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_17_axes_0 = const()[name = string("per_layer_slice_conv_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_6301_cast_fp16 = transpose(perm = var_6300, x = per_layer_slice_17_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_17_cast_fp16 = expand_dims(axes = per_layer_slice_conv_17_axes_0, x = var_6301_cast_fp16)[name = string("per_layer_slice_conv_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_267_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_17_cast_fp16)[name = string("input_267_cast_fp16")];
+            string gated_53_pad_type_0 = const()[name = string("gated_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_53_strides_0 = const()[name = string("gated_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_53_pad_0 = const()[name = string("gated_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_53_dilations_0 = const()[name = string("gated_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_53_groups_0 = const()[name = string("gated_53_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574087744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574415488))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_53_cast_fp16 = conv(dilations = gated_53_dilations_0, groups = gated_53_groups_0, pad = gated_53_pad_0, pad_type = gated_53_pad_type_0, strides = gated_53_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_267_cast_fp16)[name = string("gated_53_cast_fp16")];
+            tensor<int32, [1]> var_6317_axes_0 = const()[name = string("op_6317_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6317_cast_fp16 = squeeze(axes = var_6317_axes_0, x = gated_53_cast_fp16)[name = string("op_6317_cast_fp16")];
+            tensor<int32, [3]> var_6321 = const()[name = string("op_6321"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6327 = const()[name = string("op_6327"), val = int32(-1)];
+            fp16 const_105_promoted_to_fp16 = const()[name = string("const_105_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_177_cast_fp16 = transpose(perm = var_6321, x = var_6317_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 3, 2560]> var_6329_cast_fp16 = mul(x = x_177_cast_fp16, y = const_105_promoted_to_fp16)[name = string("op_6329_cast_fp16")];
+            bool input_269_interleave_0 = const()[name = string("input_269_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_269_cast_fp16 = concat(axis = var_6327, interleave = input_269_interleave_0, values = (x_177_cast_fp16, var_6329_cast_fp16))[name = string("input_269_cast_fp16")];
+            tensor<int32, [1]> normed_253_axes_0 = const()[name = string("normed_253_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6324_to_fp16 = const()[name = string("op_6324_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_253_cast_fp16 = layer_norm(axes = normed_253_axes_0, epsilon = var_6324_to_fp16, x = input_269_cast_fp16)[name = string("normed_253_cast_fp16")];
+            tensor<int32, [2]> var_6334_split_sizes_0 = const()[name = string("op_6334_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6334_axis_0 = const()[name = string("op_6334_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6334_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6334_cast_fp16_1 = split(axis = var_6334_axis_0, split_sizes = var_6334_split_sizes_0, x = normed_253_cast_fp16)[name = string("op_6334_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574418112)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_89_cast_fp16 = mul(x = var_6334_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_91_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1]> const_106_promoted_to_fp16 = const()[name = string("const_106_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.d4p-3])];
+            tensor<fp16, [1, 3, 2560]> x_179_cast_fp16 = mul(x = hidden_states_91_cast_fp16, y = const_106_promoted_to_fp16)[name = string("x_179_cast_fp16")];
+            int32 var_6349 = const()[name = string("op_6349"), val = int32(-1)];
+            fp16 const_107_promoted_to_fp16 = const()[name = string("const_107_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6351_cast_fp16 = mul(x = x_179_cast_fp16, y = const_107_promoted_to_fp16)[name = string("op_6351_cast_fp16")];
+            bool input_271_interleave_0 = const()[name = string("input_271_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_271_cast_fp16 = concat(axis = var_6349, interleave = input_271_interleave_0, values = (x_179_cast_fp16, var_6351_cast_fp16))[name = string("input_271_cast_fp16")];
+            tensor<int32, [1]> normed_257_axes_0 = const()[name = string("normed_257_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6346_to_fp16 = const()[name = string("op_6346_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_257_cast_fp16 = layer_norm(axes = normed_257_axes_0, epsilon = var_6346_to_fp16, x = input_271_cast_fp16)[name = string("normed_257_cast_fp16")];
+            tensor<int32, [2]> var_6356_split_sizes_0 = const()[name = string("op_6356_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6356_axis_0 = const()[name = string("op_6356_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6356_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6356_cast_fp16_1 = split(axis = var_6356_axis_0, split_sizes = var_6356_split_sizes_0, x = normed_257_cast_fp16)[name = string("op_6356_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574423296)))];
+            tensor<fp16, [1, 3, 2560]> h_55_cast_fp16 = mul(x = var_6356_cast_fp16_0, y = layers_9_input_layernorm_weight_promoted_to_fp16)[name = string("h_55_cast_fp16")];
+            tensor<int32, [3]> var_6362 = const()[name = string("op_6362"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6365_axes_0 = const()[name = string("op_6365_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6363_cast_fp16 = transpose(perm = var_6362, x = h_55_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 2560, 1, 3]> var_6365_cast_fp16 = expand_dims(axes = var_6365_axes_0, x = var_6363_cast_fp16)[name = string("op_6365_cast_fp16")];
+            string q_109_pad_type_0 = const()[name = string("q_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_109_strides_0 = const()[name = string("q_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_109_pad_0 = const()[name = string("q_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_109_dilations_0 = const()[name = string("q_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_109_groups_0 = const()[name = string("q_109_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_109 = conv(dilations = q_109_dilations_0, groups = q_109_groups_0, pad = q_109_pad_0, pad_type = q_109_pad_type_0, strides = q_109_strides_0, weight = layers_9_self_attn_q_proj_weight_palettized, x = var_6365_cast_fp16)[name = string("q_109")];
+            tensor<int32, [4]> var_6386 = const()[name = string("op_6386"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_6387 = reshape(shape = var_6386, x = q_109)[name = string("op_6387")];
+            tensor<int32, [4]> transpose_75_perm_0 = const()[name = string("transpose_75_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_6410 = const()[name = string("op_6410"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_75 = transpose(perm = transpose_75_perm_0, x = var_6387)[name = string("transpose_58")];
+            tensor<fp16, [3, 8, 256]> x_181 = reshape(shape = var_6410, x = transpose_75)[name = string("x_181")];
+            int32 var_6416 = const()[name = string("op_6416"), val = int32(-1)];
+            fp16 const_108_promoted = const()[name = string("const_108_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_6418 = mul(x = x_181, y = const_108_promoted)[name = string("op_6418")];
+            bool input_275_interleave_0 = const()[name = string("input_275_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_275 = concat(axis = var_6416, interleave = input_275_interleave_0, values = (x_181, var_6418))[name = string("input_275")];
+            tensor<int32, [1]> normed_261_axes_0 = const()[name = string("normed_261_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6413_to_fp16 = const()[name = string("op_6413_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_261_cast_fp16 = layer_norm(axes = normed_261_axes_0, epsilon = var_6413_to_fp16, x = input_275)[name = string("normed_261_cast_fp16")];
+            tensor<int32, [2]> var_6423_split_sizes_0 = const()[name = string("op_6423_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6423_axis_0 = const()[name = string("op_6423_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_6423_0, tensor<fp16, [3, 8, 256]> var_6423_1 = split(axis = var_6423_axis_0, split_sizes = var_6423_split_sizes_0, x = normed_261_cast_fp16)[name = string("op_6423")];
+            tensor<fp16, [3, 8, 256]> q_113 = mul(x = var_6423_0, y = layers_9_self_attn_q_norm_weight)[name = string("q_113")];
+            tensor<int32, [4]> var_6430 = const()[name = string("op_6430"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_6431 = reshape(shape = var_6430, x = q_113)[name = string("op_6431")];
+            tensor<int32, [4]> var_6436 = const()[name = string("op_6436"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_115 = transpose(perm = var_6436, x = var_6431)[name = string("transpose_57")];
+            tensor<fp16, [1, 8, 3, 256]> var_6438_cast_fp16 = mul(x = q_115, y = cos_s)[name = string("op_6438_cast_fp16")];
+            tensor<int32, [2]> var_6439_split_sizes_0 = const()[name = string("op_6439_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6439_axis_0 = const()[name = string("op_6439_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_6439_0, tensor<fp16, [1, 8, 3, 128]> var_6439_1 = split(axis = var_6439_axis_0, split_sizes = var_6439_split_sizes_0, x = q_115)[name = string("op_6439")];
+            fp16 const_109_promoted = const()[name = string("const_109_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_6441 = mul(x = var_6439_1, y = const_109_promoted)[name = string("op_6441")];
+            int32 var_6443 = const()[name = string("op_6443"), val = int32(-1)];
+            bool var_6444_interleave_0 = const()[name = string("op_6444_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_6444 = concat(axis = var_6443, interleave = var_6444_interleave_0, values = (var_6441, var_6439_0))[name = string("op_6444")];
+            tensor<fp16, [1, 8, 3, 256]> var_6445_cast_fp16 = mul(x = var_6444, y = sin_s)[name = string("op_6445_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_119_cast_fp16 = add(x = var_6438_cast_fp16, y = var_6445_cast_fp16)[name = string("q_119_cast_fp16")];
+            string k_57_pad_type_0 = const()[name = string("k_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_57_strides_0 = const()[name = string("k_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_57_pad_0 = const()[name = string("k_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_57_dilations_0 = const()[name = string("k_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_57_groups_0 = const()[name = string("k_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_57 = conv(dilations = k_57_dilations_0, groups = k_57_groups_0, pad = k_57_pad_0, pad_type = k_57_pad_type_0, strides = k_57_strides_0, weight = layers_9_self_attn_k_proj_weight_palettized, x = var_6365_cast_fp16)[name = string("k_57")];
+            tensor<int32, [4]> var_6463 = const()[name = string("op_6463"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_6464 = reshape(shape = var_6463, x = k_57)[name = string("op_6464")];
+            tensor<int32, [4]> transpose_76_perm_0 = const()[name = string("transpose_76_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_21_pad_type_0 = const()[name = string("v_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_21_strides_0 = const()[name = string("v_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_21_pad_0 = const()[name = string("v_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_21_dilations_0 = const()[name = string("v_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_21_groups_0 = const()[name = string("v_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_21 = conv(dilations = v_21_dilations_0, groups = v_21_groups_0, pad = v_21_pad_0, pad_type = v_21_pad_type_0, strides = v_21_strides_0, weight = layers_9_self_attn_v_proj_weight_palettized, x = var_6365_cast_fp16)[name = string("v_21")];
+            tensor<int32, [4]> var_6491 = const()[name = string("op_6491"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_6492 = reshape(shape = var_6491, x = v_21)[name = string("op_6492")];
+            tensor<int32, [4]> var_6497 = const()[name = string("op_6497"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6515 = const()[name = string("op_6515"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_76 = transpose(perm = transpose_76_perm_0, x = var_6464)[name = string("transpose_56")];
+            tensor<fp16, [3, 2, 256]> x_183 = reshape(shape = var_6515, x = transpose_76)[name = string("x_183")];
+            int32 var_6521 = const()[name = string("op_6521"), val = int32(-1)];
+            fp16 const_110_promoted = const()[name = string("const_110_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_6523 = mul(x = x_183, y = const_110_promoted)[name = string("op_6523")];
+            bool input_277_interleave_0 = const()[name = string("input_277_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_277 = concat(axis = var_6521, interleave = input_277_interleave_0, values = (x_183, var_6523))[name = string("input_277")];
+            tensor<int32, [1]> normed_265_axes_0 = const()[name = string("normed_265_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6518_to_fp16 = const()[name = string("op_6518_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_265_cast_fp16 = layer_norm(axes = normed_265_axes_0, epsilon = var_6518_to_fp16, x = input_277)[name = string("normed_265_cast_fp16")];
+            tensor<int32, [2]> var_6528_split_sizes_0 = const()[name = string("op_6528_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6528_axis_0 = const()[name = string("op_6528_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_6528_0, tensor<fp16, [3, 2, 256]> var_6528_1 = split(axis = var_6528_axis_0, split_sizes = var_6528_split_sizes_0, x = normed_265_cast_fp16)[name = string("op_6528")];
+            tensor<fp16, [3, 2, 256]> k_61 = mul(x = var_6528_0, y = layers_9_self_attn_k_norm_weight)[name = string("k_61")];
+            tensor<int32, [4]> var_6535 = const()[name = string("op_6535"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_6536 = reshape(shape = var_6535, x = k_61)[name = string("op_6536")];
+            tensor<int32, [4]> var_6541 = const()[name = string("op_6541"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_6543_promoted = const()[name = string("op_6543_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_6498 = transpose(perm = var_6497, x = var_6492)[name = string("transpose_55")];
+            tensor<fp16, [1, 2, 3, 256]> var_6544 = pow(x = var_6498, y = var_6543_promoted)[name = string("op_6544")];
+            tensor<int32, [1]> var_6549_axes_0 = const()[name = string("op_6549_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6549_keep_dims_0 = const()[name = string("op_6549_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_6549 = reduce_mean(axes = var_6549_axes_0, keep_dims = var_6549_keep_dims_0, x = var_6544)[name = string("op_6549")];
+            fp16 var_6551_to_fp16 = const()[name = string("op_6551_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_19_cast_fp16 = add(x = var_6549, y = var_6551_to_fp16)[name = string("mean_sq_19_cast_fp16")];
+            fp32 var_6553_epsilon_0 = const()[name = string("op_6553_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_6553_cast_fp16 = rsqrt(epsilon = var_6553_epsilon_0, x = mean_sq_19_cast_fp16)[name = string("op_6553_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_281_cast_fp16 = mul(x = var_6498, y = var_6553_cast_fp16)[name = string("input_281_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_117 = transpose(perm = var_6541, x = var_6536)[name = string("transpose_54")];
+            tensor<fp16, [1, 2, 3, 256]> var_6555_cast_fp16 = mul(x = q_117, y = cos_s)[name = string("op_6555_cast_fp16")];
+            tensor<int32, [2]> var_6556_split_sizes_0 = const()[name = string("op_6556_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6556_axis_0 = const()[name = string("op_6556_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_6556_0, tensor<fp16, [1, 2, 3, 128]> var_6556_1 = split(axis = var_6556_axis_0, split_sizes = var_6556_split_sizes_0, x = q_117)[name = string("op_6556")];
+            fp16 const_111_promoted = const()[name = string("const_111_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_6558 = mul(x = var_6556_1, y = const_111_promoted)[name = string("op_6558")];
+            int32 var_6560 = const()[name = string("op_6560"), val = int32(-1)];
+            bool var_6561_interleave_0 = const()[name = string("op_6561_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_6561 = concat(axis = var_6560, interleave = var_6561_interleave_0, values = (var_6558, var_6556_0))[name = string("op_6561")];
+            tensor<fp16, [1, 2, 3, 256]> var_6562_cast_fp16 = mul(x = var_6561, y = sin_s)[name = string("op_6562_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_279_cast_fp16 = add(x = var_6555_cast_fp16, y = var_6562_cast_fp16)[name = string("input_279_cast_fp16")];
+            tensor<int32, [8]> k_padded_17_pad_0 = const()[name = string("k_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_17_mode_0 = const()[name = string("k_padded_17_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_17_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = k_padded_17_mode_0, pad = k_padded_17_pad_0, x = input_279_cast_fp16)[name = string("k_padded_17_cast_fp16")];
+            tensor<int32, [8]> v_padded_17_pad_0 = const()[name = string("v_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_17_mode_0 = const()[name = string("v_padded_17_mode_0"), val = string("constant")];
+            fp16 const_113_to_fp16 = const()[name = string("const_113_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_17_cast_fp16 = pad(constant_val = const_113_to_fp16, mode = v_padded_17_mode_0, pad = v_padded_17_pad_0, x = input_281_cast_fp16)[name = string("v_padded_17_cast_fp16")];
+            tensor<int32, [4]> slot_k_19_begin_0 = const()[name = string("slot_k_19_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_19_end_0 = const()[name = string("slot_k_19_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_19_end_mask_0 = const()[name = string("slot_k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_19_cast_fp16 = slice_by_index(begin = slot_k_19_begin_0, end = slot_k_19_end_0, end_mask = slot_k_19_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("slot_k_19_cast_fp16")];
+            tensor<int32, [4]> slot_v_19_begin_0 = const()[name = string("slot_v_19_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_19_end_0 = const()[name = string("slot_v_19_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_19_end_mask_0 = const()[name = string("slot_v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_19_cast_fp16 = slice_by_index(begin = slot_v_19_begin_0, end = slot_v_19_end_0, end_mask = slot_v_19_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("slot_v_19_cast_fp16")];
+            tensor<int32, [4]> var_6601_begin_0 = const()[name = string("op_6601_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_6601_end_0 = const()[name = string("op_6601_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6601_end_mask_0 = const()[name = string("op_6601_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_6601_cast_fp16 = slice_by_index(begin = var_6601_begin_0, end = var_6601_end_0, end_mask = var_6601_end_mask_0, x = slot_k_19_cast_fp16)[name = string("op_6601_cast_fp16")];
+            int32 var_6608 = const()[name = string("op_6608"), val = int32(2)];
+            bool new_k_19_interleave_0 = const()[name = string("new_k_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_19_cast_fp16 = concat(axis = var_6608, interleave = new_k_19_interleave_0, values = (var_6601_cast_fp16, k_padded_17_cast_fp16))[name = string("new_k_19_cast_fp16")];
+            tensor<int32, [4]> var_6624_begin_0 = const()[name = string("op_6624_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_6624_end_0 = const()[name = string("op_6624_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6624_end_mask_0 = const()[name = string("op_6624_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_6624_cast_fp16 = slice_by_index(begin = var_6624_begin_0, end = var_6624_end_0, end_mask = var_6624_end_mask_0, x = slot_v_19_cast_fp16)[name = string("op_6624_cast_fp16")];
+            int32 var_6631 = const()[name = string("op_6631"), val = int32(2)];
+            bool new_v_19_interleave_0 = const()[name = string("new_v_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_19_cast_fp16 = concat(axis = var_6631, interleave = new_v_19_interleave_0, values = (var_6624_cast_fp16, v_padded_17_cast_fp16))[name = string("new_v_19_cast_fp16")];
+            tensor<int32, [4]> var_6637_begin_0 = const()[name = string("op_6637_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6637_end_0 = const()[name = string("op_6637_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_6637_end_mask_0 = const()[name = string("op_6637_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_6637_cast_fp16 = slice_by_index(begin = var_6637_begin_0, end = var_6637_end_0, end_mask = var_6637_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("op_6637_cast_fp16")];
+            tensor<int32, [4]> var_6642_begin_0 = const()[name = string("op_6642_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6642_end_0 = const()[name = string("op_6642_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6642_end_mask_0 = const()[name = string("op_6642_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6642_cast_fp16 = slice_by_index(begin = var_6642_begin_0, end = var_6642_end_0, end_mask = var_6642_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("op_6642_cast_fp16")];
+            int32 var_6644 = const()[name = string("op_6644"), val = int32(0)];
+            bool K_sliding_out_17_interleave_0 = const()[name = string("K_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_17_cast_fp16 = concat(axis = var_6644, interleave = K_sliding_out_17_interleave_0, values = (var_6637_cast_fp16, new_k_19_cast_fp16, var_6642_cast_fp16))[name = string("K_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_6650_begin_0 = const()[name = string("op_6650_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6650_end_0 = const()[name = string("op_6650_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_6650_end_mask_0 = const()[name = string("op_6650_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_6650_cast_fp16 = slice_by_index(begin = var_6650_begin_0, end = var_6650_end_0, end_mask = var_6650_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("op_6650_cast_fp16")];
+            tensor<int32, [4]> var_6655_begin_0 = const()[name = string("op_6655_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6655_end_0 = const()[name = string("op_6655_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6655_end_mask_0 = const()[name = string("op_6655_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6655_cast_fp16 = slice_by_index(begin = var_6655_begin_0, end = var_6655_end_0, end_mask = var_6655_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("op_6655_cast_fp16")];
+            int32 var_6657 = const()[name = string("op_6657"), val = int32(0)];
+            bool V_sliding_out_17_interleave_0 = const()[name = string("V_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_17_cast_fp16 = concat(axis = var_6657, interleave = V_sliding_out_17_interleave_0, values = (var_6650_cast_fp16, new_v_19_cast_fp16, var_6655_cast_fp16))[name = string("V_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_6663_begin_0 = const()[name = string("op_6663_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6663_end_0 = const()[name = string("op_6663_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_6663_end_mask_0 = const()[name = string("op_6663_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6663_cast_fp16 = slice_by_index(begin = var_6663_begin_0, end = var_6663_end_0, end_mask = var_6663_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("op_6663_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_19_begin_0 = const()[name = string("K_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_19_end_0 = const()[name = string("K_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_19_end_mask_0 = const()[name = string("K_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_19_cast_fp16 = slice_by_index(begin = K_for_attn_19_begin_0, end = K_for_attn_19_end_0, end_mask = K_for_attn_19_end_mask_0, x = var_6663_cast_fp16)[name = string("K_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> var_6673_begin_0 = const()[name = string("op_6673_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6673_end_0 = const()[name = string("op_6673_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_6673_end_mask_0 = const()[name = string("op_6673_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6673_cast_fp16 = slice_by_index(begin = var_6673_begin_0, end = var_6673_end_0, end_mask = var_6673_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("op_6673_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_19_begin_0 = const()[name = string("V_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_19_end_0 = const()[name = string("V_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_19_end_mask_0 = const()[name = string("V_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_19_cast_fp16 = slice_by_index(begin = V_for_attn_19_begin_0, end = V_for_attn_19_end_0, end_mask = V_for_attn_19_end_mask_0, x = var_6673_cast_fp16)[name = string("V_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_18_reps_0 = const()[name = string("tile_18_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = K_for_attn_19_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [8, 1, 512, 256]> tile_18_cast_fp16 = tile(reps = tile_18_reps_0, x = transpose_36_cast_fp16)[name = string("tile_18_cast_fp16")];
+            tensor<int32, [5]> concat_38 = const()[name = string("concat_38"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_36_cast_fp16 = reshape(shape = concat_38, x = tile_18_cast_fp16)[name = string("reshape_36_cast_fp16")];
+            tensor<int32, [5]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_39 = const()[name = string("concat_39"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_36_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_37_cast_fp16 = reshape(shape = concat_39, x = transpose_37_cast_fp16)[name = string("reshape_37_cast_fp16")];
+            tensor<int32, [4]> transpose_77_perm_0 = const()[name = string("transpose_77_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_19_reps_0 = const()[name = string("tile_19_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = V_for_attn_19_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [8, 1, 512, 256]> tile_19_cast_fp16 = tile(reps = tile_19_reps_0, x = transpose_38_cast_fp16)[name = string("tile_19_cast_fp16")];
+            tensor<int32, [5]> concat_40 = const()[name = string("concat_40"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_38_cast_fp16 = reshape(shape = concat_40, x = tile_19_cast_fp16)[name = string("reshape_38_cast_fp16")];
+            tensor<int32, [5]> transpose_39_perm_0 = const()[name = string("transpose_39_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_41 = const()[name = string("concat_41"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_39_cast_fp16 = transpose(perm = transpose_39_perm_0, x = reshape_38_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_39_cast_fp16 = reshape(shape = concat_41, x = transpose_39_cast_fp16)[name = string("reshape_39_cast_fp16")];
+            tensor<int32, [4]> V_expanded_19_perm_0 = const()[name = string("V_expanded_19_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_37_transpose_x_0 = const()[name = string("attn_weights_37_transpose_x_0"), val = bool(false)];
+            bool attn_weights_37_transpose_y_0 = const()[name = string("attn_weights_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_77_cast_fp16 = transpose(perm = transpose_77_perm_0, x = reshape_37_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_37_cast_fp16 = matmul(transpose_x = attn_weights_37_transpose_x_0, transpose_y = attn_weights_37_transpose_y_0, x = q_119_cast_fp16, y = transpose_77_cast_fp16)[name = string("attn_weights_37_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_187_cast_fp16 = add(x = attn_weights_37_cast_fp16, y = causal_mask_sliding)[name = string("x_187_cast_fp16")];
+            tensor<int32, [1]> reduce_max_9_axes_0 = const()[name = string("reduce_max_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_9_keep_dims_0 = const()[name = string("reduce_max_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_9 = reduce_max(axes = reduce_max_9_axes_0, keep_dims = reduce_max_9_keep_dims_0, x = x_187_cast_fp16)[name = string("reduce_max_9")];
+            tensor<fp16, [1, 8, 3, 512]> var_6708 = sub(x = x_187_cast_fp16, y = reduce_max_9)[name = string("op_6708")];
+            tensor<fp16, [1, 8, 3, 512]> var_6714 = exp(x = var_6708)[name = string("op_6714")];
+            tensor<int32, [1]> var_6724_axes_0 = const()[name = string("op_6724_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6724_keep_dims_0 = const()[name = string("op_6724_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_6724 = reduce_sum(axes = var_6724_axes_0, keep_dims = var_6724_keep_dims_0, x = var_6714)[name = string("op_6724")];
+            tensor<fp16, [1, 8, 3, 512]> var_6730_cast_fp16 = real_div(x = var_6714, y = var_6724)[name = string("op_6730_cast_fp16")];
+            bool attn_output_55_transpose_x_0 = const()[name = string("attn_output_55_transpose_x_0"), val = bool(false)];
+            bool attn_output_55_transpose_y_0 = const()[name = string("attn_output_55_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_19_cast_fp16 = transpose(perm = V_expanded_19_perm_0, x = reshape_39_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_55_cast_fp16 = matmul(transpose_x = attn_output_55_transpose_x_0, transpose_y = attn_output_55_transpose_y_0, x = var_6730_cast_fp16, y = V_expanded_19_cast_fp16)[name = string("attn_output_55_cast_fp16")];
+            tensor<int32, [4]> var_6741 = const()[name = string("op_6741"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6748 = const()[name = string("op_6748"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_6742_cast_fp16 = transpose(perm = var_6741, x = attn_output_55_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 3, 2048]> attn_output_57_cast_fp16 = reshape(shape = var_6748, x = var_6742_cast_fp16)[name = string("attn_output_57_cast_fp16")];
+            tensor<int32, [3]> var_6753 = const()[name = string("op_6753"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6769_pad_type_0 = const()[name = string("op_6769_pad_type_0"), val = string("valid")];
+            int32 var_6769_groups_0 = const()[name = string("op_6769_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6769_strides_0 = const()[name = string("op_6769_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6769_pad_0 = const()[name = string("op_6769_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6769_dilations_0 = const()[name = string("op_6769_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_9_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(574428480))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577049984))))[name = string("squeeze_9_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_6754_cast_fp16 = transpose(perm = var_6753, x = attn_output_57_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 2560, 3]> var_6769_cast_fp16 = conv(dilations = var_6769_dilations_0, groups = var_6769_groups_0, pad = var_6769_pad_0, pad_type = var_6769_pad_type_0, strides = var_6769_strides_0, weight = squeeze_9_cast_fp16_to_fp32_to_fp16_palettized, x = var_6754_cast_fp16)[name = string("op_6769_cast_fp16")];
+            tensor<int32, [3]> var_6773 = const()[name = string("op_6773"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6779 = const()[name = string("op_6779"), val = int32(-1)];
+            fp16 const_114_promoted_to_fp16 = const()[name = string("const_114_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_191_cast_fp16 = transpose(perm = var_6773, x = var_6769_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 3, 2560]> var_6781_cast_fp16 = mul(x = x_191_cast_fp16, y = const_114_promoted_to_fp16)[name = string("op_6781_cast_fp16")];
+            bool input_285_interleave_0 = const()[name = string("input_285_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_285_cast_fp16 = concat(axis = var_6779, interleave = input_285_interleave_0, values = (x_191_cast_fp16, var_6781_cast_fp16))[name = string("input_285_cast_fp16")];
+            tensor<int32, [1]> normed_269_axes_0 = const()[name = string("normed_269_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6776_to_fp16 = const()[name = string("op_6776_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_269_cast_fp16 = layer_norm(axes = normed_269_axes_0, epsilon = var_6776_to_fp16, x = input_285_cast_fp16)[name = string("normed_269_cast_fp16")];
+            tensor<int32, [2]> var_6786_split_sizes_0 = const()[name = string("op_6786_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6786_axis_0 = const()[name = string("op_6786_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6786_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6786_cast_fp16_1 = split(axis = var_6786_axis_0, split_sizes = var_6786_split_sizes_0, x = normed_269_cast_fp16)[name = string("op_6786_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577052608)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_59_cast_fp16 = mul(x = var_6786_cast_fp16_0, y = layers_9_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_59_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_193_cast_fp16 = add(x = x_179_cast_fp16, y = attn_output_59_cast_fp16)[name = string("x_193_cast_fp16")];
+            int32 var_6795 = const()[name = string("op_6795"), val = int32(-1)];
+            fp16 const_115_promoted_to_fp16 = const()[name = string("const_115_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6797_cast_fp16 = mul(x = x_193_cast_fp16, y = const_115_promoted_to_fp16)[name = string("op_6797_cast_fp16")];
+            bool input_287_interleave_0 = const()[name = string("input_287_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_287_cast_fp16 = concat(axis = var_6795, interleave = input_287_interleave_0, values = (x_193_cast_fp16, var_6797_cast_fp16))[name = string("input_287_cast_fp16")];
+            tensor<int32, [1]> normed_273_axes_0 = const()[name = string("normed_273_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6792_to_fp16 = const()[name = string("op_6792_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_273_cast_fp16 = layer_norm(axes = normed_273_axes_0, epsilon = var_6792_to_fp16, x = input_287_cast_fp16)[name = string("normed_273_cast_fp16")];
+            tensor<int32, [2]> var_6802_split_sizes_0 = const()[name = string("op_6802_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6802_axis_0 = const()[name = string("op_6802_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6802_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6802_cast_fp16_1 = split(axis = var_6802_axis_0, split_sizes = var_6802_split_sizes_0, x = normed_273_cast_fp16)[name = string("op_6802_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577057792)))];
+            tensor<fp16, [1, 3, 2560]> h_57_cast_fp16 = mul(x = var_6802_cast_fp16_0, y = layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_57_cast_fp16")];
+            tensor<int32, [3]> var_6813 = const()[name = string("op_6813"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_289_axes_0 = const()[name = string("input_289_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6814 = transpose(perm = var_6813, x = h_57_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 2560, 1, 3]> input_289 = expand_dims(axes = input_289_axes_0, x = var_6814)[name = string("input_289")];
+            string gate_37_pad_type_0 = const()[name = string("gate_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_37_strides_0 = const()[name = string("gate_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_37_pad_0 = const()[name = string("gate_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_37_dilations_0 = const()[name = string("gate_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_37_groups_0 = const()[name = string("gate_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_37 = conv(dilations = gate_37_dilations_0, groups = gate_37_groups_0, pad = gate_37_pad_0, pad_type = gate_37_pad_type_0, strides = gate_37_strides_0, weight = layers_9_mlp_gate_proj_weight_palettized, x = input_289)[name = string("gate_37")];
+            string up_19_pad_type_0 = const()[name = string("up_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_19_strides_0 = const()[name = string("up_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_19_pad_0 = const()[name = string("up_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_19_dilations_0 = const()[name = string("up_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_19_groups_0 = const()[name = string("up_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_19 = conv(dilations = up_19_dilations_0, groups = up_19_groups_0, pad = up_19_pad_0, pad_type = up_19_pad_type_0, strides = up_19_strides_0, weight = layers_9_mlp_up_proj_weight_palettized, x = input_289)[name = string("up_19")];
+            string gate_39_mode_0 = const()[name = string("gate_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_39 = gelu(mode = gate_39_mode_0, x = gate_37)[name = string("gate_39")];
+            tensor<fp16, [1, 10240, 1, 3]> input_291 = mul(x = gate_39, y = up_19)[name = string("input_291")];
+            string mlp_out_19_pad_type_0 = const()[name = string("mlp_out_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_19_strides_0 = const()[name = string("mlp_out_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_19_pad_0 = const()[name = string("mlp_out_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_19_dilations_0 = const()[name = string("mlp_out_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_19_groups_0 = const()[name = string("mlp_out_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_19 = conv(dilations = mlp_out_19_dilations_0, groups = mlp_out_19_groups_0, pad = mlp_out_19_pad_0, pad_type = mlp_out_19_pad_type_0, strides = mlp_out_19_strides_0, weight = layers_9_mlp_down_proj_weight_palettized, x = input_291)[name = string("mlp_out_19")];
+            tensor<int32, [1]> var_6854_axes_0 = const()[name = string("op_6854_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6854 = squeeze(axes = var_6854_axes_0, x = mlp_out_19)[name = string("op_6854")];
+            tensor<int32, [3]> var_6858 = const()[name = string("op_6858"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6864 = const()[name = string("op_6864"), val = int32(-1)];
+            fp16 const_116_promoted = const()[name = string("const_116_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_195 = transpose(perm = var_6858, x = var_6854)[name = string("transpose_43")];
+            tensor<fp16, [1, 3, 2560]> var_6866 = mul(x = x_195, y = const_116_promoted)[name = string("op_6866")];
+            bool input_293_interleave_0 = const()[name = string("input_293_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_293 = concat(axis = var_6864, interleave = input_293_interleave_0, values = (x_195, var_6866))[name = string("input_293")];
+            tensor<int32, [1]> normed_277_axes_0 = const()[name = string("normed_277_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6861_to_fp16 = const()[name = string("op_6861_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_277_cast_fp16 = layer_norm(axes = normed_277_axes_0, epsilon = var_6861_to_fp16, x = input_293)[name = string("normed_277_cast_fp16")];
+            tensor<int32, [2]> var_6871_split_sizes_0 = const()[name = string("op_6871_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6871_axis_0 = const()[name = string("op_6871_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6871_0, tensor<fp16, [1, 3, 2560]> var_6871_1 = split(axis = var_6871_axis_0, split_sizes = var_6871_split_sizes_0, x = normed_277_cast_fp16)[name = string("op_6871")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_93 = mul(x = var_6871_0, y = layers_9_post_feedforward_layernorm_weight)[name = string("hidden_states_93")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_95_cast_fp16 = add(x = x_193_cast_fp16, y = hidden_states_93)[name = string("hidden_states_95_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_19_begin_0 = const()[name = string("per_layer_slice_19_begin_0"), val = tensor<int32, [3]>([0, 0, 2304])];
+            tensor<int32, [3]> per_layer_slice_19_end_0 = const()[name = string("per_layer_slice_19_end_0"), val = tensor<int32, [3]>([1, 3, 2560])];
+            tensor<bool, [3]> per_layer_slice_19_end_mask_0 = const()[name = string("per_layer_slice_19_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_19_cast_fp16 = slice_by_index(begin = per_layer_slice_19_begin_0, end = per_layer_slice_19_end_0, end_mask = per_layer_slice_19_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_19_cast_fp16")];
+            tensor<int32, [3]> var_6899 = const()[name = string("op_6899"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_295_axes_0 = const()[name = string("input_295_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6900 = transpose(perm = var_6899, x = hidden_states_95_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 2560, 1, 3]> input_295 = expand_dims(axes = input_295_axes_0, x = var_6900)[name = string("input_295")];
+            string gated_55_pad_type_0 = const()[name = string("gated_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_55_strides_0 = const()[name = string("gated_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_55_pad_0 = const()[name = string("gated_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_55_dilations_0 = const()[name = string("gated_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_55_groups_0 = const()[name = string("gated_55_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_55 = conv(dilations = gated_55_dilations_0, groups = gated_55_groups_0, pad = gated_55_pad_0, pad_type = gated_55_pad_type_0, strides = gated_55_strides_0, weight = layers_9_per_layer_input_gate_weight_palettized, x = input_295)[name = string("gated_55")];
+            string gated_57_mode_0 = const()[name = string("gated_57_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_57 = gelu(mode = gated_57_mode_0, x = gated_55)[name = string("gated_57")];
+            tensor<int32, [3]> var_6919 = const()[name = string("op_6919"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_19_axes_0 = const()[name = string("per_layer_slice_conv_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_6920_cast_fp16 = transpose(perm = var_6919, x = per_layer_slice_19_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_19_cast_fp16 = expand_dims(axes = per_layer_slice_conv_19_axes_0, x = var_6920_cast_fp16)[name = string("per_layer_slice_conv_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_297_cast_fp16 = mul(x = gated_57, y = per_layer_slice_conv_19_cast_fp16)[name = string("input_297_cast_fp16")];
+            string gated_59_pad_type_0 = const()[name = string("gated_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_59_strides_0 = const()[name = string("gated_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_59_pad_0 = const()[name = string("gated_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_59_dilations_0 = const()[name = string("gated_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_59_groups_0 = const()[name = string("gated_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_9_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577062976))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577390720))))[name = string("layers_9_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_59_cast_fp16 = conv(dilations = gated_59_dilations_0, groups = gated_59_groups_0, pad = gated_59_pad_0, pad_type = gated_59_pad_type_0, strides = gated_59_strides_0, weight = layers_9_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_297_cast_fp16)[name = string("gated_59_cast_fp16")];
+            tensor<int32, [1]> var_6936_axes_0 = const()[name = string("op_6936_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6936_cast_fp16 = squeeze(axes = var_6936_axes_0, x = gated_59_cast_fp16)[name = string("op_6936_cast_fp16")];
+            tensor<int32, [3]> var_6940 = const()[name = string("op_6940"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6946 = const()[name = string("op_6946"), val = int32(-1)];
+            fp16 const_117_promoted_to_fp16 = const()[name = string("const_117_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_197_cast_fp16 = transpose(perm = var_6940, x = var_6936_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 3, 2560]> var_6948_cast_fp16 = mul(x = x_197_cast_fp16, y = const_117_promoted_to_fp16)[name = string("op_6948_cast_fp16")];
+            bool input_299_interleave_0 = const()[name = string("input_299_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_299_cast_fp16 = concat(axis = var_6946, interleave = input_299_interleave_0, values = (x_197_cast_fp16, var_6948_cast_fp16))[name = string("input_299_cast_fp16")];
+            tensor<int32, [1]> normed_281_axes_0 = const()[name = string("normed_281_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6943_to_fp16 = const()[name = string("op_6943_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_281_cast_fp16 = layer_norm(axes = normed_281_axes_0, epsilon = var_6943_to_fp16, x = input_299_cast_fp16)[name = string("normed_281_cast_fp16")];
+            tensor<int32, [2]> var_6953_split_sizes_0 = const()[name = string("op_6953_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6953_axis_0 = const()[name = string("op_6953_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6953_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6953_cast_fp16_1 = split(axis = var_6953_axis_0, split_sizes = var_6953_split_sizes_0, x = normed_281_cast_fp16)[name = string("op_6953_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577393344)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_99_cast_fp16 = mul(x = var_6953_cast_fp16_0, y = layers_9_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_101_cast_fp16 = add(x = hidden_states_95_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1]> const_118_promoted_to_fp16 = const()[name = string("const_118_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a8p-2])];
+            tensor<fp16, [1, 3, 2560]> x_199_cast_fp16 = mul(x = hidden_states_101_cast_fp16, y = const_118_promoted_to_fp16)[name = string("x_199_cast_fp16")];
+            int32 var_6968 = const()[name = string("op_6968"), val = int32(-1)];
+            fp16 const_119_promoted_to_fp16 = const()[name = string("const_119_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6970_cast_fp16 = mul(x = x_199_cast_fp16, y = const_119_promoted_to_fp16)[name = string("op_6970_cast_fp16")];
+            bool input_301_interleave_0 = const()[name = string("input_301_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_301_cast_fp16 = concat(axis = var_6968, interleave = input_301_interleave_0, values = (x_199_cast_fp16, var_6970_cast_fp16))[name = string("input_301_cast_fp16")];
+            tensor<int32, [1]> normed_285_axes_0 = const()[name = string("normed_285_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6965_to_fp16 = const()[name = string("op_6965_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_285_cast_fp16 = layer_norm(axes = normed_285_axes_0, epsilon = var_6965_to_fp16, x = input_301_cast_fp16)[name = string("normed_285_cast_fp16")];
+            tensor<int32, [2]> var_6975_split_sizes_0 = const()[name = string("op_6975_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6975_axis_0 = const()[name = string("op_6975_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6975_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6975_cast_fp16_1 = split(axis = var_6975_axis_0, split_sizes = var_6975_split_sizes_0, x = normed_285_cast_fp16)[name = string("op_6975_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577398528)))];
+            tensor<fp16, [1, 3, 2560]> h_61_cast_fp16 = mul(x = var_6975_cast_fp16_0, y = layers_10_input_layernorm_weight_promoted_to_fp16)[name = string("h_61_cast_fp16")];
+            tensor<int32, [3]> var_6981 = const()[name = string("op_6981"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6984_axes_0 = const()[name = string("op_6984_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6982_cast_fp16 = transpose(perm = var_6981, x = h_61_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 2560, 1, 3]> var_6984_cast_fp16 = expand_dims(axes = var_6984_axes_0, x = var_6982_cast_fp16)[name = string("op_6984_cast_fp16")];
+            string q_121_pad_type_0 = const()[name = string("q_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_121_strides_0 = const()[name = string("q_121_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_121_pad_0 = const()[name = string("q_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_121_dilations_0 = const()[name = string("q_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_121_groups_0 = const()[name = string("q_121_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_121 = conv(dilations = q_121_dilations_0, groups = q_121_groups_0, pad = q_121_pad_0, pad_type = q_121_pad_type_0, strides = q_121_strides_0, weight = layers_10_self_attn_q_proj_weight_palettized, x = var_6984_cast_fp16)[name = string("q_121")];
+            tensor<int32, [4]> var_7005 = const()[name = string("op_7005"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_7006 = reshape(shape = var_7005, x = q_121)[name = string("op_7006")];
+            tensor<int32, [4]> transpose_78_perm_0 = const()[name = string("transpose_78_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_7029 = const()[name = string("op_7029"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_78 = transpose(perm = transpose_78_perm_0, x = var_7006)[name = string("transpose_38")];
+            tensor<fp16, [3, 8, 256]> x_201 = reshape(shape = var_7029, x = transpose_78)[name = string("x_201")];
+            int32 var_7035 = const()[name = string("op_7035"), val = int32(-1)];
+            fp16 const_120_promoted = const()[name = string("const_120_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_7037 = mul(x = x_201, y = const_120_promoted)[name = string("op_7037")];
+            bool input_305_interleave_0 = const()[name = string("input_305_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_305 = concat(axis = var_7035, interleave = input_305_interleave_0, values = (x_201, var_7037))[name = string("input_305")];
+            tensor<int32, [1]> normed_289_axes_0 = const()[name = string("normed_289_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7032_to_fp16 = const()[name = string("op_7032_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_289_cast_fp16 = layer_norm(axes = normed_289_axes_0, epsilon = var_7032_to_fp16, x = input_305)[name = string("normed_289_cast_fp16")];
+            tensor<int32, [2]> var_7042_split_sizes_0 = const()[name = string("op_7042_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7042_axis_0 = const()[name = string("op_7042_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_7042_0, tensor<fp16, [3, 8, 256]> var_7042_1 = split(axis = var_7042_axis_0, split_sizes = var_7042_split_sizes_0, x = normed_289_cast_fp16)[name = string("op_7042")];
+            tensor<fp16, [3, 8, 256]> q_125 = mul(x = var_7042_0, y = layers_3_self_attn_q_norm_weight)[name = string("q_125")];
+            tensor<int32, [4]> var_7049 = const()[name = string("op_7049"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_7050 = reshape(shape = var_7049, x = q_125)[name = string("op_7050")];
+            tensor<int32, [4]> var_7055 = const()[name = string("op_7055"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_127 = transpose(perm = var_7055, x = var_7050)[name = string("transpose_37")];
+            tensor<fp16, [1, 8, 3, 256]> var_7057_cast_fp16 = mul(x = q_127, y = cos_s)[name = string("op_7057_cast_fp16")];
+            tensor<int32, [2]> var_7058_split_sizes_0 = const()[name = string("op_7058_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_7058_axis_0 = const()[name = string("op_7058_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_7058_0, tensor<fp16, [1, 8, 3, 128]> var_7058_1 = split(axis = var_7058_axis_0, split_sizes = var_7058_split_sizes_0, x = q_127)[name = string("op_7058")];
+            fp16 const_121_promoted = const()[name = string("const_121_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_7060 = mul(x = var_7058_1, y = const_121_promoted)[name = string("op_7060")];
+            int32 var_7062 = const()[name = string("op_7062"), val = int32(-1)];
+            bool var_7063_interleave_0 = const()[name = string("op_7063_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_7063 = concat(axis = var_7062, interleave = var_7063_interleave_0, values = (var_7060, var_7058_0))[name = string("op_7063")];
+            tensor<fp16, [1, 8, 3, 256]> var_7064_cast_fp16 = mul(x = var_7063, y = sin_s)[name = string("op_7064_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_131_cast_fp16 = add(x = var_7057_cast_fp16, y = var_7064_cast_fp16)[name = string("q_131_cast_fp16")];
+            string k_63_pad_type_0 = const()[name = string("k_63_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_63_strides_0 = const()[name = string("k_63_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_63_pad_0 = const()[name = string("k_63_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_63_dilations_0 = const()[name = string("k_63_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_63_groups_0 = const()[name = string("k_63_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_63 = conv(dilations = k_63_dilations_0, groups = k_63_groups_0, pad = k_63_pad_0, pad_type = k_63_pad_type_0, strides = k_63_strides_0, weight = layers_10_self_attn_k_proj_weight_palettized, x = var_6984_cast_fp16)[name = string("k_63")];
+            tensor<int32, [4]> var_7082 = const()[name = string("op_7082"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_7083 = reshape(shape = var_7082, x = k_63)[name = string("op_7083")];
+            tensor<int32, [4]> transpose_79_perm_0 = const()[name = string("transpose_79_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_23_pad_type_0 = const()[name = string("v_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_23_strides_0 = const()[name = string("v_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_23_pad_0 = const()[name = string("v_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_23_dilations_0 = const()[name = string("v_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_23_groups_0 = const()[name = string("v_23_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_23 = conv(dilations = v_23_dilations_0, groups = v_23_groups_0, pad = v_23_pad_0, pad_type = v_23_pad_type_0, strides = v_23_strides_0, weight = layers_10_self_attn_v_proj_weight_palettized, x = var_6984_cast_fp16)[name = string("v_23")];
+            tensor<int32, [4]> var_7110 = const()[name = string("op_7110"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_7111 = reshape(shape = var_7110, x = v_23)[name = string("op_7111")];
+            tensor<int32, [4]> var_7116 = const()[name = string("op_7116"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7134 = const()[name = string("op_7134"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_79 = transpose(perm = transpose_79_perm_0, x = var_7083)[name = string("transpose_36")];
+            tensor<fp16, [3, 2, 256]> x_203 = reshape(shape = var_7134, x = transpose_79)[name = string("x_203")];
+            int32 var_7140 = const()[name = string("op_7140"), val = int32(-1)];
+            fp16 const_122_promoted = const()[name = string("const_122_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_7142 = mul(x = x_203, y = const_122_promoted)[name = string("op_7142")];
+            bool input_307_interleave_0 = const()[name = string("input_307_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_307 = concat(axis = var_7140, interleave = input_307_interleave_0, values = (x_203, var_7142))[name = string("input_307")];
+            tensor<int32, [1]> normed_293_axes_0 = const()[name = string("normed_293_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7137_to_fp16 = const()[name = string("op_7137_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_293_cast_fp16 = layer_norm(axes = normed_293_axes_0, epsilon = var_7137_to_fp16, x = input_307)[name = string("normed_293_cast_fp16")];
+            tensor<int32, [2]> var_7147_split_sizes_0 = const()[name = string("op_7147_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7147_axis_0 = const()[name = string("op_7147_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_7147_0, tensor<fp16, [3, 2, 256]> var_7147_1 = split(axis = var_7147_axis_0, split_sizes = var_7147_split_sizes_0, x = normed_293_cast_fp16)[name = string("op_7147")];
+            tensor<fp16, [3, 2, 256]> k_67 = mul(x = var_7147_0, y = layers_6_self_attn_k_norm_weight)[name = string("k_67")];
+            tensor<int32, [4]> var_7154 = const()[name = string("op_7154"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_7155 = reshape(shape = var_7154, x = k_67)[name = string("op_7155")];
+            tensor<int32, [4]> var_7160 = const()[name = string("op_7160"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_7162_promoted = const()[name = string("op_7162_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_7117 = transpose(perm = var_7116, x = var_7111)[name = string("transpose_35")];
+            tensor<fp16, [1, 2, 3, 256]> var_7163 = pow(x = var_7117, y = var_7162_promoted)[name = string("op_7163")];
+            tensor<int32, [1]> var_7168_axes_0 = const()[name = string("op_7168_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7168_keep_dims_0 = const()[name = string("op_7168_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_7168 = reduce_mean(axes = var_7168_axes_0, keep_dims = var_7168_keep_dims_0, x = var_7163)[name = string("op_7168")];
+            fp16 var_7170_to_fp16 = const()[name = string("op_7170_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_21_cast_fp16 = add(x = var_7168, y = var_7170_to_fp16)[name = string("mean_sq_21_cast_fp16")];
+            fp32 var_7172_epsilon_0 = const()[name = string("op_7172_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_7172_cast_fp16 = rsqrt(epsilon = var_7172_epsilon_0, x = mean_sq_21_cast_fp16)[name = string("op_7172_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_311_cast_fp16 = mul(x = var_7117, y = var_7172_cast_fp16)[name = string("input_311_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_129 = transpose(perm = var_7160, x = var_7155)[name = string("transpose_34")];
+            tensor<fp16, [1, 2, 3, 256]> var_7174_cast_fp16 = mul(x = q_129, y = cos_s)[name = string("op_7174_cast_fp16")];
+            tensor<int32, [2]> var_7175_split_sizes_0 = const()[name = string("op_7175_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_7175_axis_0 = const()[name = string("op_7175_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_7175_0, tensor<fp16, [1, 2, 3, 128]> var_7175_1 = split(axis = var_7175_axis_0, split_sizes = var_7175_split_sizes_0, x = q_129)[name = string("op_7175")];
+            fp16 const_123_promoted = const()[name = string("const_123_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_7177 = mul(x = var_7175_1, y = const_123_promoted)[name = string("op_7177")];
+            int32 var_7179 = const()[name = string("op_7179"), val = int32(-1)];
+            bool var_7180_interleave_0 = const()[name = string("op_7180_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_7180 = concat(axis = var_7179, interleave = var_7180_interleave_0, values = (var_7177, var_7175_0))[name = string("op_7180")];
+            tensor<fp16, [1, 2, 3, 256]> var_7181_cast_fp16 = mul(x = var_7180, y = sin_s)[name = string("op_7181_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_309_cast_fp16 = add(x = var_7174_cast_fp16, y = var_7181_cast_fp16)[name = string("input_309_cast_fp16")];
+            tensor<int32, [8]> k_padded_pad_0 = const()[name = string("k_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_mode_0 = const()[name = string("k_padded_mode_0"), val = string("constant")];
+            fp16 const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_cast_fp16 = pad(constant_val = const_124_to_fp16, mode = k_padded_mode_0, pad = k_padded_pad_0, x = input_309_cast_fp16)[name = string("k_padded_cast_fp16")];
+            tensor<int32, [8]> v_padded_pad_0 = const()[name = string("v_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_mode_0 = const()[name = string("v_padded_mode_0"), val = string("constant")];
+            fp16 const_125_to_fp16 = const()[name = string("const_125_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_cast_fp16 = pad(constant_val = const_125_to_fp16, mode = v_padded_mode_0, pad = v_padded_pad_0, x = input_311_cast_fp16)[name = string("v_padded_cast_fp16")];
+            tensor<int32, [4]> slot_k_21_begin_0 = const()[name = string("slot_k_21_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_21_end_0 = const()[name = string("slot_k_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_21_end_mask_0 = const()[name = string("slot_k_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_21_cast_fp16 = slice_by_index(begin = slot_k_21_begin_0, end = slot_k_21_end_0, end_mask = slot_k_21_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("slot_k_21_cast_fp16")];
+            tensor<int32, [4]> slot_v_21_begin_0 = const()[name = string("slot_v_21_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_21_end_0 = const()[name = string("slot_v_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_21_end_mask_0 = const()[name = string("slot_v_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_21_cast_fp16 = slice_by_index(begin = slot_v_21_begin_0, end = slot_v_21_end_0, end_mask = slot_v_21_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("slot_v_21_cast_fp16")];
+            tensor<int32, [4]> var_7220_begin_0 = const()[name = string("op_7220_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_7220_end_0 = const()[name = string("op_7220_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7220_end_mask_0 = const()[name = string("op_7220_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_7220_cast_fp16 = slice_by_index(begin = var_7220_begin_0, end = var_7220_end_0, end_mask = var_7220_end_mask_0, x = slot_k_21_cast_fp16)[name = string("op_7220_cast_fp16")];
+            int32 var_7227 = const()[name = string("op_7227"), val = int32(2)];
+            bool new_k_21_interleave_0 = const()[name = string("new_k_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_21_cast_fp16 = concat(axis = var_7227, interleave = new_k_21_interleave_0, values = (var_7220_cast_fp16, k_padded_cast_fp16))[name = string("new_k_21_cast_fp16")];
+            tensor<int32, [4]> var_7243_begin_0 = const()[name = string("op_7243_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_7243_end_0 = const()[name = string("op_7243_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7243_end_mask_0 = const()[name = string("op_7243_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_7243_cast_fp16 = slice_by_index(begin = var_7243_begin_0, end = var_7243_end_0, end_mask = var_7243_end_mask_0, x = slot_v_21_cast_fp16)[name = string("op_7243_cast_fp16")];
+            int32 var_7250 = const()[name = string("op_7250"), val = int32(2)];
+            bool new_v_21_interleave_0 = const()[name = string("new_v_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_21_cast_fp16 = concat(axis = var_7250, interleave = new_v_21_interleave_0, values = (var_7243_cast_fp16, v_padded_cast_fp16))[name = string("new_v_21_cast_fp16")];
+            tensor<int32, [4]> var_7256_begin_0 = const()[name = string("op_7256_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7256_end_0 = const()[name = string("op_7256_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_7256_end_mask_0 = const()[name = string("op_7256_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_7256_cast_fp16 = slice_by_index(begin = var_7256_begin_0, end = var_7256_end_0, end_mask = var_7256_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("op_7256_cast_fp16")];
+            int32 var_7263 = const()[name = string("op_7263"), val = int32(0)];
+            bool K_sliding_out_interleave_0 = const()[name = string("K_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out = concat(axis = var_7263, interleave = K_sliding_out_interleave_0, values = (var_7256_cast_fp16, new_k_21_cast_fp16))[name = string("K_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_7269_begin_0 = const()[name = string("op_7269_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7269_end_0 = const()[name = string("op_7269_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_7269_end_mask_0 = const()[name = string("op_7269_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_7269_cast_fp16 = slice_by_index(begin = var_7269_begin_0, end = var_7269_end_0, end_mask = var_7269_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("op_7269_cast_fp16")];
+            int32 var_7276 = const()[name = string("op_7276"), val = int32(0)];
+            bool V_sliding_out_interleave_0 = const()[name = string("V_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out = concat(axis = var_7276, interleave = V_sliding_out_interleave_0, values = (var_7269_cast_fp16, new_v_21_cast_fp16))[name = string("V_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_7282_begin_0 = const()[name = string("op_7282_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_7282_end_0 = const()[name = string("op_7282_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7282_end_mask_0 = const()[name = string("op_7282_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_7282_cast_fp16 = slice_by_index(begin = var_7282_begin_0, end = var_7282_end_0, end_mask = var_7282_end_mask_0, x = K_sliding_out)[name = string("op_7282_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_21_begin_0 = const()[name = string("K_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_21_end_0 = const()[name = string("K_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_21_end_mask_0 = const()[name = string("K_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_21_cast_fp16 = slice_by_index(begin = K_for_attn_21_begin_0, end = K_for_attn_21_end_0, end_mask = K_for_attn_21_end_mask_0, x = var_7282_cast_fp16)[name = string("K_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> var_7292_begin_0 = const()[name = string("op_7292_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_7292_end_0 = const()[name = string("op_7292_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7292_end_mask_0 = const()[name = string("op_7292_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_7292_cast_fp16 = slice_by_index(begin = var_7292_begin_0, end = var_7292_end_0, end_mask = var_7292_end_mask_0, x = V_sliding_out)[name = string("op_7292_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_21_begin_0 = const()[name = string("V_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_21_end_0 = const()[name = string("V_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_21_end_mask_0 = const()[name = string("V_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_21_cast_fp16 = slice_by_index(begin = V_for_attn_21_begin_0, end = V_for_attn_21_end_0, end_mask = V_for_attn_21_end_mask_0, x = var_7292_cast_fp16)[name = string("V_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_20_reps_0 = const()[name = string("tile_20_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_40_cast_fp16 = transpose(perm = transpose_40_perm_0, x = K_for_attn_21_cast_fp16)[name = string("transpose_33")];
+            tensor<fp16, [8, 1, 512, 256]> tile_20_cast_fp16 = tile(reps = tile_20_reps_0, x = transpose_40_cast_fp16)[name = string("tile_20_cast_fp16")];
+            tensor<int32, [5]> concat_42 = const()[name = string("concat_42"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_40_cast_fp16 = reshape(shape = concat_42, x = tile_20_cast_fp16)[name = string("reshape_40_cast_fp16")];
+            tensor<int32, [5]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_43 = const()[name = string("concat_43"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_40_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_41_cast_fp16 = reshape(shape = concat_43, x = transpose_41_cast_fp16)[name = string("reshape_41_cast_fp16")];
+            tensor<int32, [4]> transpose_80_perm_0 = const()[name = string("transpose_80_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_21_reps_0 = const()[name = string("tile_21_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_42_cast_fp16 = transpose(perm = transpose_42_perm_0, x = V_for_attn_21_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [8, 1, 512, 256]> tile_21_cast_fp16 = tile(reps = tile_21_reps_0, x = transpose_42_cast_fp16)[name = string("tile_21_cast_fp16")];
+            tensor<int32, [5]> concat_44 = const()[name = string("concat_44"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_42_cast_fp16 = reshape(shape = concat_44, x = tile_21_cast_fp16)[name = string("reshape_42_cast_fp16")];
+            tensor<int32, [5]> transpose_43_perm_0 = const()[name = string("transpose_43_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_45 = const()[name = string("concat_45"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_43_cast_fp16 = transpose(perm = transpose_43_perm_0, x = reshape_42_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_43_cast_fp16 = reshape(shape = concat_45, x = transpose_43_cast_fp16)[name = string("reshape_43_cast_fp16")];
+            tensor<int32, [4]> V_expanded_21_perm_0 = const()[name = string("V_expanded_21_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_41_transpose_x_0 = const()[name = string("attn_weights_41_transpose_x_0"), val = bool(false)];
+            bool attn_weights_41_transpose_y_0 = const()[name = string("attn_weights_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_80_cast_fp16 = transpose(perm = transpose_80_perm_0, x = reshape_41_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_41_cast_fp16 = matmul(transpose_x = attn_weights_41_transpose_x_0, transpose_y = attn_weights_41_transpose_y_0, x = q_131_cast_fp16, y = transpose_80_cast_fp16)[name = string("attn_weights_41_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_207_cast_fp16 = add(x = attn_weights_41_cast_fp16, y = causal_mask_sliding)[name = string("x_207_cast_fp16")];
+            tensor<int32, [1]> reduce_max_10_axes_0 = const()[name = string("reduce_max_10_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_10_keep_dims_0 = const()[name = string("reduce_max_10_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_10 = reduce_max(axes = reduce_max_10_axes_0, keep_dims = reduce_max_10_keep_dims_0, x = x_207_cast_fp16)[name = string("reduce_max_10")];
+            tensor<fp16, [1, 8, 3, 512]> var_7327 = sub(x = x_207_cast_fp16, y = reduce_max_10)[name = string("op_7327")];
+            tensor<fp16, [1, 8, 3, 512]> var_7333 = exp(x = var_7327)[name = string("op_7333")];
+            tensor<int32, [1]> var_7343_axes_0 = const()[name = string("op_7343_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7343_keep_dims_0 = const()[name = string("op_7343_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_7343 = reduce_sum(axes = var_7343_axes_0, keep_dims = var_7343_keep_dims_0, x = var_7333)[name = string("op_7343")];
+            tensor<fp16, [1, 8, 3, 512]> var_7349_cast_fp16 = real_div(x = var_7333, y = var_7343)[name = string("op_7349_cast_fp16")];
+            bool attn_output_61_transpose_x_0 = const()[name = string("attn_output_61_transpose_x_0"), val = bool(false)];
+            bool attn_output_61_transpose_y_0 = const()[name = string("attn_output_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_21_cast_fp16 = transpose(perm = V_expanded_21_perm_0, x = reshape_43_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_61_cast_fp16 = matmul(transpose_x = attn_output_61_transpose_x_0, transpose_y = attn_output_61_transpose_y_0, x = var_7349_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_61_cast_fp16")];
+            tensor<int32, [4]> var_7360 = const()[name = string("op_7360"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7367 = const()[name = string("op_7367"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_7361_cast_fp16 = transpose(perm = var_7360, x = attn_output_61_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 3, 2048]> attn_output_63_cast_fp16 = reshape(shape = var_7367, x = var_7361_cast_fp16)[name = string("attn_output_63_cast_fp16")];
+            tensor<int32, [3]> var_7372 = const()[name = string("op_7372"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7388_pad_type_0 = const()[name = string("op_7388_pad_type_0"), val = string("valid")];
+            int32 var_7388_groups_0 = const()[name = string("op_7388_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7388_strides_0 = const()[name = string("op_7388_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7388_pad_0 = const()[name = string("op_7388_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7388_dilations_0 = const()[name = string("op_7388_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_10_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(577403712))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580025216))))[name = string("squeeze_10_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_7373_cast_fp16 = transpose(perm = var_7372, x = attn_output_63_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 3]> var_7388_cast_fp16 = conv(dilations = var_7388_dilations_0, groups = var_7388_groups_0, pad = var_7388_pad_0, pad_type = var_7388_pad_type_0, strides = var_7388_strides_0, weight = squeeze_10_cast_fp16_to_fp32_to_fp16_palettized, x = var_7373_cast_fp16)[name = string("op_7388_cast_fp16")];
+            tensor<int32, [3]> var_7392 = const()[name = string("op_7392"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7398 = const()[name = string("op_7398"), val = int32(-1)];
+            fp16 const_126_promoted_to_fp16 = const()[name = string("const_126_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_211_cast_fp16 = transpose(perm = var_7392, x = var_7388_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 3, 2560]> var_7400_cast_fp16 = mul(x = x_211_cast_fp16, y = const_126_promoted_to_fp16)[name = string("op_7400_cast_fp16")];
+            bool input_315_interleave_0 = const()[name = string("input_315_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_315_cast_fp16 = concat(axis = var_7398, interleave = input_315_interleave_0, values = (x_211_cast_fp16, var_7400_cast_fp16))[name = string("input_315_cast_fp16")];
+            tensor<int32, [1]> normed_297_axes_0 = const()[name = string("normed_297_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7395_to_fp16 = const()[name = string("op_7395_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_297_cast_fp16 = layer_norm(axes = normed_297_axes_0, epsilon = var_7395_to_fp16, x = input_315_cast_fp16)[name = string("normed_297_cast_fp16")];
+            tensor<int32, [2]> var_7405_split_sizes_0 = const()[name = string("op_7405_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7405_axis_0 = const()[name = string("op_7405_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7405_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7405_cast_fp16_1 = split(axis = var_7405_axis_0, split_sizes = var_7405_split_sizes_0, x = normed_297_cast_fp16)[name = string("op_7405_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580027840)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_65_cast_fp16 = mul(x = var_7405_cast_fp16_0, y = layers_10_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_65_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_213_cast_fp16 = add(x = x_199_cast_fp16, y = attn_output_65_cast_fp16)[name = string("x_213_cast_fp16")];
+            int32 var_7414 = const()[name = string("op_7414"), val = int32(-1)];
+            fp16 const_127_promoted_to_fp16 = const()[name = string("const_127_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_7416_cast_fp16 = mul(x = x_213_cast_fp16, y = const_127_promoted_to_fp16)[name = string("op_7416_cast_fp16")];
+            bool input_317_interleave_0 = const()[name = string("input_317_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_317_cast_fp16 = concat(axis = var_7414, interleave = input_317_interleave_0, values = (x_213_cast_fp16, var_7416_cast_fp16))[name = string("input_317_cast_fp16")];
+            tensor<int32, [1]> normed_301_axes_0 = const()[name = string("normed_301_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7411_to_fp16 = const()[name = string("op_7411_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_301_cast_fp16 = layer_norm(axes = normed_301_axes_0, epsilon = var_7411_to_fp16, x = input_317_cast_fp16)[name = string("normed_301_cast_fp16")];
+            tensor<int32, [2]> var_7421_split_sizes_0 = const()[name = string("op_7421_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7421_axis_0 = const()[name = string("op_7421_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7421_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7421_cast_fp16_1 = split(axis = var_7421_axis_0, split_sizes = var_7421_split_sizes_0, x = normed_301_cast_fp16)[name = string("op_7421_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580033024)))];
+            tensor<fp16, [1, 3, 2560]> h_63_cast_fp16 = mul(x = var_7421_cast_fp16_0, y = layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_63_cast_fp16")];
+            tensor<int32, [3]> var_7432 = const()[name = string("op_7432"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_319_axes_0 = const()[name = string("input_319_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7433 = transpose(perm = var_7432, x = h_63_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 3]> input_319 = expand_dims(axes = input_319_axes_0, x = var_7433)[name = string("input_319")];
+            string gate_41_pad_type_0 = const()[name = string("gate_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_41_strides_0 = const()[name = string("gate_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_41_pad_0 = const()[name = string("gate_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_41_dilations_0 = const()[name = string("gate_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_41_groups_0 = const()[name = string("gate_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_41 = conv(dilations = gate_41_dilations_0, groups = gate_41_groups_0, pad = gate_41_pad_0, pad_type = gate_41_pad_type_0, strides = gate_41_strides_0, weight = layers_10_mlp_gate_proj_weight_palettized, x = input_319)[name = string("gate_41")];
+            string up_21_pad_type_0 = const()[name = string("up_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_21_strides_0 = const()[name = string("up_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_21_pad_0 = const()[name = string("up_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_21_dilations_0 = const()[name = string("up_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_21_groups_0 = const()[name = string("up_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_21 = conv(dilations = up_21_dilations_0, groups = up_21_groups_0, pad = up_21_pad_0, pad_type = up_21_pad_type_0, strides = up_21_strides_0, weight = layers_10_mlp_up_proj_weight_palettized, x = input_319)[name = string("up_21")];
+            string gate_43_mode_0 = const()[name = string("gate_43_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_43 = gelu(mode = gate_43_mode_0, x = gate_41)[name = string("gate_43")];
+            tensor<fp16, [1, 10240, 1, 3]> input_321 = mul(x = gate_43, y = up_21)[name = string("input_321")];
+            string mlp_out_21_pad_type_0 = const()[name = string("mlp_out_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_21_strides_0 = const()[name = string("mlp_out_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_21_pad_0 = const()[name = string("mlp_out_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_21_dilations_0 = const()[name = string("mlp_out_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_21_groups_0 = const()[name = string("mlp_out_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_21 = conv(dilations = mlp_out_21_dilations_0, groups = mlp_out_21_groups_0, pad = mlp_out_21_pad_0, pad_type = mlp_out_21_pad_type_0, strides = mlp_out_21_strides_0, weight = layers_10_mlp_down_proj_weight_palettized, x = input_321)[name = string("mlp_out_21")];
+            tensor<int32, [1]> var_7473_axes_0 = const()[name = string("op_7473_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7473 = squeeze(axes = var_7473_axes_0, x = mlp_out_21)[name = string("op_7473")];
+            tensor<int32, [3]> var_7477 = const()[name = string("op_7477"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7483 = const()[name = string("op_7483"), val = int32(-1)];
+            fp16 const_128_promoted = const()[name = string("const_128_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_215 = transpose(perm = var_7477, x = var_7473)[name = string("transpose_23")];
+            tensor<fp16, [1, 3, 2560]> var_7485 = mul(x = x_215, y = const_128_promoted)[name = string("op_7485")];
+            bool input_323_interleave_0 = const()[name = string("input_323_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_323 = concat(axis = var_7483, interleave = input_323_interleave_0, values = (x_215, var_7485))[name = string("input_323")];
+            tensor<int32, [1]> normed_305_axes_0 = const()[name = string("normed_305_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7480_to_fp16 = const()[name = string("op_7480_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_305_cast_fp16 = layer_norm(axes = normed_305_axes_0, epsilon = var_7480_to_fp16, x = input_323)[name = string("normed_305_cast_fp16")];
+            tensor<int32, [2]> var_7490_split_sizes_0 = const()[name = string("op_7490_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7490_axis_0 = const()[name = string("op_7490_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7490_0, tensor<fp16, [1, 3, 2560]> var_7490_1 = split(axis = var_7490_axis_0, split_sizes = var_7490_split_sizes_0, x = normed_305_cast_fp16)[name = string("op_7490")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_103 = mul(x = var_7490_0, y = layers_10_post_feedforward_layernorm_weight)[name = string("hidden_states_103")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_105_cast_fp16 = add(x = x_213_cast_fp16, y = hidden_states_103)[name = string("hidden_states_105_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_21_begin_0 = const()[name = string("per_layer_slice_21_begin_0"), val = tensor<int32, [3]>([0, 0, 2560])];
+            tensor<int32, [3]> per_layer_slice_21_end_0 = const()[name = string("per_layer_slice_21_end_0"), val = tensor<int32, [3]>([1, 3, 2816])];
+            tensor<bool, [3]> per_layer_slice_21_end_mask_0 = const()[name = string("per_layer_slice_21_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_21_cast_fp16 = slice_by_index(begin = per_layer_slice_21_begin_0, end = per_layer_slice_21_end_0, end_mask = per_layer_slice_21_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_21_cast_fp16")];
+            tensor<int32, [3]> var_7518 = const()[name = string("op_7518"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_325_axes_0 = const()[name = string("input_325_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7519 = transpose(perm = var_7518, x = hidden_states_105_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 2560, 1, 3]> input_325 = expand_dims(axes = input_325_axes_0, x = var_7519)[name = string("input_325")];
+            string gated_61_pad_type_0 = const()[name = string("gated_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_61_strides_0 = const()[name = string("gated_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_61_pad_0 = const()[name = string("gated_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_61_dilations_0 = const()[name = string("gated_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_61_groups_0 = const()[name = string("gated_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_61 = conv(dilations = gated_61_dilations_0, groups = gated_61_groups_0, pad = gated_61_pad_0, pad_type = gated_61_pad_type_0, strides = gated_61_strides_0, weight = layers_10_per_layer_input_gate_weight_palettized, x = input_325)[name = string("gated_61")];
+            string gated_63_mode_0 = const()[name = string("gated_63_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_63 = gelu(mode = gated_63_mode_0, x = gated_61)[name = string("gated_63")];
+            tensor<int32, [3]> var_7538 = const()[name = string("op_7538"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_21_axes_0 = const()[name = string("per_layer_slice_conv_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_7539_cast_fp16 = transpose(perm = var_7538, x = per_layer_slice_21_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_21_cast_fp16 = expand_dims(axes = per_layer_slice_conv_21_axes_0, x = var_7539_cast_fp16)[name = string("per_layer_slice_conv_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_327_cast_fp16 = mul(x = gated_63, y = per_layer_slice_conv_21_cast_fp16)[name = string("input_327_cast_fp16")];
+            string gated_65_pad_type_0 = const()[name = string("gated_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_65_strides_0 = const()[name = string("gated_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_65_pad_0 = const()[name = string("gated_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_65_dilations_0 = const()[name = string("gated_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_65_groups_0 = const()[name = string("gated_65_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_10_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580038208))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580365952))))[name = string("layers_10_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_65_cast_fp16 = conv(dilations = gated_65_dilations_0, groups = gated_65_groups_0, pad = gated_65_pad_0, pad_type = gated_65_pad_type_0, strides = gated_65_strides_0, weight = layers_10_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_327_cast_fp16)[name = string("gated_65_cast_fp16")];
+            tensor<int32, [1]> var_7555_axes_0 = const()[name = string("op_7555_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7555_cast_fp16 = squeeze(axes = var_7555_axes_0, x = gated_65_cast_fp16)[name = string("op_7555_cast_fp16")];
+            tensor<int32, [3]> var_7559 = const()[name = string("op_7559"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7565 = const()[name = string("op_7565"), val = int32(-1)];
+            fp16 const_129_promoted_to_fp16 = const()[name = string("const_129_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_217_cast_fp16 = transpose(perm = var_7559, x = var_7555_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 3, 2560]> var_7567_cast_fp16 = mul(x = x_217_cast_fp16, y = const_129_promoted_to_fp16)[name = string("op_7567_cast_fp16")];
+            bool input_329_interleave_0 = const()[name = string("input_329_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_329_cast_fp16 = concat(axis = var_7565, interleave = input_329_interleave_0, values = (x_217_cast_fp16, var_7567_cast_fp16))[name = string("input_329_cast_fp16")];
+            tensor<int32, [1]> normed_309_axes_0 = const()[name = string("normed_309_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7562_to_fp16 = const()[name = string("op_7562_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_309_cast_fp16 = layer_norm(axes = normed_309_axes_0, epsilon = var_7562_to_fp16, x = input_329_cast_fp16)[name = string("normed_309_cast_fp16")];
+            tensor<int32, [2]> var_7572_split_sizes_0 = const()[name = string("op_7572_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7572_axis_0 = const()[name = string("op_7572_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7572_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7572_cast_fp16_1 = split(axis = var_7572_axis_0, split_sizes = var_7572_split_sizes_0, x = normed_309_cast_fp16)[name = string("op_7572_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580368576)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_109_cast_fp16 = mul(x = var_7572_cast_fp16_0, y = layers_10_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_111_cast_fp16 = add(x = hidden_states_105_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1]> const_130_promoted_to_fp16 = const()[name = string("const_130_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.3ep-1])];
+            tensor<fp16, [1, 3, 2560]> x_219_cast_fp16 = mul(x = hidden_states_111_cast_fp16, y = const_130_promoted_to_fp16)[name = string("x_219_cast_fp16")];
+            int32 var_7587 = const()[name = string("op_7587"), val = int32(-1)];
+            fp16 const_131_promoted_to_fp16 = const()[name = string("const_131_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_7589_cast_fp16 = mul(x = x_219_cast_fp16, y = const_131_promoted_to_fp16)[name = string("op_7589_cast_fp16")];
+            bool input_331_interleave_0 = const()[name = string("input_331_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_331_cast_fp16 = concat(axis = var_7587, interleave = input_331_interleave_0, values = (x_219_cast_fp16, var_7589_cast_fp16))[name = string("input_331_cast_fp16")];
+            tensor<int32, [1]> normed_313_axes_0 = const()[name = string("normed_313_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7584_to_fp16 = const()[name = string("op_7584_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_313_cast_fp16 = layer_norm(axes = normed_313_axes_0, epsilon = var_7584_to_fp16, x = input_331_cast_fp16)[name = string("normed_313_cast_fp16")];
+            tensor<int32, [2]> var_7594_split_sizes_0 = const()[name = string("op_7594_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7594_axis_0 = const()[name = string("op_7594_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7594_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7594_cast_fp16_1 = split(axis = var_7594_axis_0, split_sizes = var_7594_split_sizes_0, x = normed_313_cast_fp16)[name = string("op_7594_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580373760)))];
+            tensor<fp16, [1, 3, 2560]> h_67_cast_fp16 = mul(x = var_7594_cast_fp16_0, y = layers_11_input_layernorm_weight_promoted_to_fp16)[name = string("h_67_cast_fp16")];
+            tensor<int32, [3]> var_7600 = const()[name = string("op_7600"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_7603_axes_0 = const()[name = string("op_7603_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7601_cast_fp16 = transpose(perm = var_7600, x = h_67_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 2560, 1, 3]> var_7603_cast_fp16 = expand_dims(axes = var_7603_axes_0, x = var_7601_cast_fp16)[name = string("op_7603_cast_fp16")];
+            string q_133_pad_type_0 = const()[name = string("q_133_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_133_strides_0 = const()[name = string("q_133_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_133_pad_0 = const()[name = string("q_133_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_133_dilations_0 = const()[name = string("q_133_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_133_groups_0 = const()[name = string("q_133_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_133 = conv(dilations = q_133_dilations_0, groups = q_133_groups_0, pad = q_133_pad_0, pad_type = q_133_pad_type_0, strides = q_133_strides_0, weight = layers_11_self_attn_q_proj_weight_palettized, x = var_7603_cast_fp16)[name = string("q_133")];
+            tensor<int32, [4]> var_7624 = const()[name = string("op_7624"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_7625 = reshape(shape = var_7624, x = q_133)[name = string("op_7625")];
+            tensor<int32, [4]> transpose_81_perm_0 = const()[name = string("transpose_81_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_7648 = const()[name = string("op_7648"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_81 = transpose(perm = transpose_81_perm_0, x = var_7625)[name = string("transpose_18")];
+            tensor<fp16, [3, 8, 512]> x_221 = reshape(shape = var_7648, x = transpose_81)[name = string("x_221")];
+            int32 var_7654 = const()[name = string("op_7654"), val = int32(-1)];
+            fp16 const_132_promoted = const()[name = string("const_132_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_7656 = mul(x = x_221, y = const_132_promoted)[name = string("op_7656")];
+            bool input_335_interleave_0 = const()[name = string("input_335_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_335 = concat(axis = var_7654, interleave = input_335_interleave_0, values = (x_221, var_7656))[name = string("input_335")];
+            tensor<int32, [1]> normed_317_axes_0 = const()[name = string("normed_317_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7651_to_fp16 = const()[name = string("op_7651_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_317_cast_fp16 = layer_norm(axes = normed_317_axes_0, epsilon = var_7651_to_fp16, x = input_335)[name = string("normed_317_cast_fp16")];
+            tensor<int32, [2]> var_7661_split_sizes_0 = const()[name = string("op_7661_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7661_axis_0 = const()[name = string("op_7661_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_7661_0, tensor<fp16, [3, 8, 512]> var_7661_1 = split(axis = var_7661_axis_0, split_sizes = var_7661_split_sizes_0, x = normed_317_cast_fp16)[name = string("op_7661")];
+            tensor<fp16, [3, 8, 512]> q_137 = mul(x = var_7661_0, y = layers_11_self_attn_q_norm_weight)[name = string("q_137")];
+            tensor<int32, [4]> var_7668 = const()[name = string("op_7668"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_7669 = reshape(shape = var_7668, x = q_137)[name = string("op_7669")];
+            tensor<int32, [4]> var_7674 = const()[name = string("op_7674"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_139 = transpose(perm = var_7674, x = var_7669)[name = string("transpose_17")];
+            tensor<fp16, [1, 8, 3, 512]> var_7676_cast_fp16 = mul(x = q_139, y = cos_f)[name = string("op_7676_cast_fp16")];
+            tensor<int32, [2]> var_7677_split_sizes_0 = const()[name = string("op_7677_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7677_axis_0 = const()[name = string("op_7677_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_7677_0, tensor<fp16, [1, 8, 3, 256]> var_7677_1 = split(axis = var_7677_axis_0, split_sizes = var_7677_split_sizes_0, x = q_139)[name = string("op_7677")];
+            fp16 const_133_promoted = const()[name = string("const_133_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_7679 = mul(x = var_7677_1, y = const_133_promoted)[name = string("op_7679")];
+            int32 var_7681 = const()[name = string("op_7681"), val = int32(-1)];
+            bool var_7682_interleave_0 = const()[name = string("op_7682_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_7682 = concat(axis = var_7681, interleave = var_7682_interleave_0, values = (var_7679, var_7677_0))[name = string("op_7682")];
+            tensor<fp16, [1, 8, 3, 512]> var_7683_cast_fp16 = mul(x = var_7682, y = sin_f)[name = string("op_7683_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_cast_fp16 = add(x = var_7676_cast_fp16, y = var_7683_cast_fp16)[name = string("q_cast_fp16")];
+            string k_69_pad_type_0 = const()[name = string("k_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_69_strides_0 = const()[name = string("k_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_69_pad_0 = const()[name = string("k_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_69_dilations_0 = const()[name = string("k_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_69_groups_0 = const()[name = string("k_69_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> k_69 = conv(dilations = k_69_dilations_0, groups = k_69_groups_0, pad = k_69_pad_0, pad_type = k_69_pad_type_0, strides = k_69_strides_0, weight = layers_11_self_attn_k_proj_weight_palettized, x = var_7603_cast_fp16)[name = string("k_69")];
+            tensor<int32, [4]> var_7701 = const()[name = string("op_7701"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_7702 = reshape(shape = var_7701, x = k_69)[name = string("op_7702")];
+            tensor<int32, [4]> transpose_82_perm_0 = const()[name = string("transpose_82_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_25_pad_type_0 = const()[name = string("v_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_25_strides_0 = const()[name = string("v_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_25_pad_0 = const()[name = string("v_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_25_dilations_0 = const()[name = string("v_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_25_groups_0 = const()[name = string("v_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> v_25 = conv(dilations = v_25_dilations_0, groups = v_25_groups_0, pad = v_25_pad_0, pad_type = v_25_pad_type_0, strides = v_25_strides_0, weight = layers_11_self_attn_v_proj_weight_palettized, x = var_7603_cast_fp16)[name = string("v_25")];
+            tensor<int32, [4]> var_7729 = const()[name = string("op_7729"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_7730 = reshape(shape = var_7729, x = v_25)[name = string("op_7730")];
+            tensor<int32, [4]> var_7735 = const()[name = string("op_7735"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7753 = const()[name = string("op_7753"), val = tensor<int32, [3]>([3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> transpose_82 = transpose(perm = transpose_82_perm_0, x = var_7702)[name = string("transpose_16")];
+            tensor<fp16, [3, 2, 512]> x_223 = reshape(shape = var_7753, x = transpose_82)[name = string("x_223")];
+            int32 var_7759 = const()[name = string("op_7759"), val = int32(-1)];
+            fp16 const_134_promoted = const()[name = string("const_134_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 512]> var_7761 = mul(x = x_223, y = const_134_promoted)[name = string("op_7761")];
+            bool input_337_interleave_0 = const()[name = string("input_337_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 1024]> input_337 = concat(axis = var_7759, interleave = input_337_interleave_0, values = (x_223, var_7761))[name = string("input_337")];
+            tensor<int32, [1]> normed_321_axes_0 = const()[name = string("normed_321_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7756_to_fp16 = const()[name = string("op_7756_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 1024]> normed_321_cast_fp16 = layer_norm(axes = normed_321_axes_0, epsilon = var_7756_to_fp16, x = input_337)[name = string("normed_321_cast_fp16")];
+            tensor<int32, [2]> var_7766_split_sizes_0 = const()[name = string("op_7766_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7766_axis_0 = const()[name = string("op_7766_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 512]> var_7766_0, tensor<fp16, [3, 2, 512]> var_7766_1 = split(axis = var_7766_axis_0, split_sizes = var_7766_split_sizes_0, x = normed_321_cast_fp16)[name = string("op_7766")];
+            tensor<fp16, [3, 2, 512]> k_73 = mul(x = var_7766_0, y = layers_11_self_attn_k_norm_weight)[name = string("k_73")];
+            tensor<int32, [4]> var_7773 = const()[name = string("op_7773"), val = tensor<int32, [4]>([1, 3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> var_7774 = reshape(shape = var_7773, x = k_73)[name = string("op_7774")];
+            tensor<int32, [4]> var_7779 = const()[name = string("op_7779"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_7781_promoted = const()[name = string("op_7781_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 512]> var_7736 = transpose(perm = var_7735, x = var_7730)[name = string("transpose_15")];
+            tensor<fp16, [1, 2, 3, 512]> var_7782 = pow(x = var_7736, y = var_7781_promoted)[name = string("op_7782")];
+            tensor<int32, [1]> var_7787_axes_0 = const()[name = string("op_7787_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7787_keep_dims_0 = const()[name = string("op_7787_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_7787 = reduce_mean(axes = var_7787_axes_0, keep_dims = var_7787_keep_dims_0, x = var_7782)[name = string("op_7787")];
+            fp16 var_7789_to_fp16 = const()[name = string("op_7789_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_cast_fp16 = add(x = var_7787, y = var_7789_to_fp16)[name = string("mean_sq_cast_fp16")];
+            fp32 var_7791_epsilon_0 = const()[name = string("op_7791_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_7791_cast_fp16 = rsqrt(epsilon = var_7791_epsilon_0, x = mean_sq_cast_fp16)[name = string("op_7791_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> v_cast_fp16 = mul(x = var_7736, y = var_7791_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> q_141 = transpose(perm = var_7779, x = var_7774)[name = string("transpose_14")];
+            tensor<fp16, [1, 2, 3, 512]> var_7793_cast_fp16 = mul(x = q_141, y = cos_f)[name = string("op_7793_cast_fp16")];
+            tensor<int32, [2]> var_7794_split_sizes_0 = const()[name = string("op_7794_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7794_axis_0 = const()[name = string("op_7794_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 256]> var_7794_0, tensor<fp16, [1, 2, 3, 256]> var_7794_1 = split(axis = var_7794_axis_0, split_sizes = var_7794_split_sizes_0, x = q_141)[name = string("op_7794")];
+            fp16 const_135_promoted = const()[name = string("const_135_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 256]> var_7796 = mul(x = var_7794_1, y = const_135_promoted)[name = string("op_7796")];
+            int32 var_7798 = const()[name = string("op_7798"), val = int32(-1)];
+            bool var_7799_interleave_0 = const()[name = string("op_7799_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 512]> var_7799 = concat(axis = var_7798, interleave = var_7799_interleave_0, values = (var_7796, var_7794_0))[name = string("op_7799")];
+            tensor<fp16, [1, 2, 3, 512]> var_7800_cast_fp16 = mul(x = var_7799, y = sin_f)[name = string("op_7800_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> k_cast_fp16 = add(x = var_7793_cast_fp16, y = var_7800_cast_fp16)[name = string("k_cast_fp16")];
+            bool k_scattered_transpose_x_0 = const()[name = string("k_scattered_transpose_x_0"), val = bool(false)];
+            bool k_scattered_transpose_y_0 = const()[name = string("k_scattered_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> k_scattered_cast_fp16 = matmul(transpose_x = k_scattered_transpose_x_0, transpose_y = k_scattered_transpose_y_0, x = var_4120_cast_fp16, y = k_cast_fp16)[name = string("k_scattered_cast_fp16")];
+            bool v_scattered_transpose_x_0 = const()[name = string("v_scattered_transpose_x_0"), val = bool(false)];
+            bool v_scattered_transpose_y_0 = const()[name = string("v_scattered_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> v_scattered_cast_fp16 = matmul(transpose_x = v_scattered_transpose_x_0, transpose_y = v_scattered_transpose_y_0, x = var_4120_cast_fp16, y = v_cast_fp16)[name = string("v_scattered_cast_fp16")];
+            tensor<int32, [4]> slot_k_begin_0 = const()[name = string("slot_k_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_end_0 = const()[name = string("slot_k_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_k_end_mask_0 = const()[name = string("slot_k_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_k_cast_fp16 = slice_by_index(begin = slot_k_begin_0, end = slot_k_end_0, end_mask = slot_k_end_mask_0, x = K_full_out_1_cast_fp16)[name = string("slot_k_cast_fp16")];
+            tensor<int32, [4]> slot_v_begin_0 = const()[name = string("slot_v_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_end_0 = const()[name = string("slot_v_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_v_end_mask_0 = const()[name = string("slot_v_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_v_cast_fp16 = slice_by_index(begin = slot_v_begin_0, end = slot_v_end_0, end_mask = slot_v_end_mask_0, x = V_full_out_1_cast_fp16)[name = string("slot_v_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7837_cast_fp16 = mul(x = slot_k_cast_fp16, y = var_4147_cast_fp16)[name = string("op_7837_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_k_cast_fp16 = add(x = var_7837_cast_fp16, y = k_scattered_cast_fp16)[name = string("new_k_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7843_cast_fp16 = mul(x = slot_v_cast_fp16, y = var_4147_cast_fp16)[name = string("op_7843_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_v_cast_fp16 = add(x = var_7843_cast_fp16, y = v_scattered_cast_fp16)[name = string("new_v_cast_fp16")];
+            int32 var_7857 = const()[name = string("op_7857"), val = int32(0)];
+            bool K_full_out_interleave_0 = const()[name = string("K_full_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out = concat(axis = var_7857, interleave = K_full_out_interleave_0, values = (var_4187_cast_fp16, new_k_cast_fp16))[name = string("K_full_out_cast_fp16")];
+            int32 var_7870 = const()[name = string("op_7870"), val = int32(0)];
+            bool V_full_out_interleave_0 = const()[name = string("V_full_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out = concat(axis = var_7870, interleave = V_full_out_interleave_0, values = (var_4197_cast_fp16, new_v_cast_fp16))[name = string("V_full_out_cast_fp16")];
+            tensor<int32, [4]> var_7876_begin_0 = const()[name = string("op_7876_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_7876_end_0 = const()[name = string("op_7876_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_7876_end_mask_0 = const()[name = string("op_7876_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7876_cast_fp16 = slice_by_index(begin = var_7876_begin_0, end = var_7876_end_0, end_mask = var_7876_end_mask_0, x = K_full_out)[name = string("op_7876_cast_fp16")];
+            tensor<int32, [4]> var_7886_begin_0 = const()[name = string("op_7886_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_7886_end_0 = const()[name = string("op_7886_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_7886_end_mask_0 = const()[name = string("op_7886_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7886_cast_fp16 = slice_by_index(begin = var_7886_begin_0, end = var_7886_end_0, end_mask = var_7886_end_mask_0, x = V_full_out)[name = string("op_7886_cast_fp16")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_22_reps_0 = const()[name = string("tile_22_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_44_cast_fp16 = transpose(perm = transpose_44_perm_0, x = var_7876_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_22_cast_fp16 = tile(reps = tile_22_reps_0, x = transpose_44_cast_fp16)[name = string("tile_22_cast_fp16")];
+            tensor<int32, [5]> concat_48 = const()[name = string("concat_48"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_44_cast_fp16 = reshape(shape = concat_48, x = tile_22_cast_fp16)[name = string("reshape_44_cast_fp16")];
+            tensor<int32, [5]> transpose_45_perm_0 = const()[name = string("transpose_45_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_49 = const()[name = string("concat_49"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_45_cast_fp16 = transpose(perm = transpose_45_perm_0, x = reshape_44_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_45_cast_fp16 = reshape(shape = concat_49, x = transpose_45_cast_fp16)[name = string("reshape_45_cast_fp16")];
+            tensor<int32, [4]> transpose_83_perm_0 = const()[name = string("transpose_83_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_23_reps_0 = const()[name = string("tile_23_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_46_cast_fp16 = transpose(perm = transpose_46_perm_0, x = var_7886_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_23_cast_fp16 = tile(reps = tile_23_reps_0, x = transpose_46_cast_fp16)[name = string("tile_23_cast_fp16")];
+            tensor<int32, [5]> concat_50 = const()[name = string("concat_50"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_46_cast_fp16 = reshape(shape = concat_50, x = tile_23_cast_fp16)[name = string("reshape_46_cast_fp16")];
+            tensor<int32, [5]> transpose_47_perm_0 = const()[name = string("transpose_47_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_51 = const()[name = string("concat_51"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_47_cast_fp16 = transpose(perm = transpose_47_perm_0, x = reshape_46_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_47_cast_fp16 = reshape(shape = concat_51, x = transpose_47_cast_fp16)[name = string("reshape_47_cast_fp16")];
+            tensor<int32, [4]> V_expanded_perm_0 = const()[name = string("V_expanded_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_45_transpose_x_0 = const()[name = string("attn_weights_45_transpose_x_0"), val = bool(false)];
+            bool attn_weights_45_transpose_y_0 = const()[name = string("attn_weights_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_83_cast_fp16 = transpose(perm = transpose_83_perm_0, x = reshape_45_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_45_cast_fp16 = matmul(transpose_x = attn_weights_45_transpose_x_0, transpose_y = attn_weights_45_transpose_y_0, x = q_cast_fp16, y = transpose_83_cast_fp16)[name = string("attn_weights_45_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_227_cast_fp16 = add(x = attn_weights_45_cast_fp16, y = causal_mask_full)[name = string("x_227_cast_fp16")];
+            tensor<int32, [1]> reduce_max_11_axes_0 = const()[name = string("reduce_max_11_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_11_keep_dims_0 = const()[name = string("reduce_max_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_11 = reduce_max(axes = reduce_max_11_axes_0, keep_dims = reduce_max_11_keep_dims_0, x = x_227_cast_fp16)[name = string("reduce_max_11")];
+            tensor<fp16, [1, 8, 3, 2048]> var_7921 = sub(x = x_227_cast_fp16, y = reduce_max_11)[name = string("op_7921")];
+            tensor<fp16, [1, 8, 3, 2048]> var_7927 = exp(x = var_7921)[name = string("op_7927")];
+            tensor<int32, [1]> var_7937_axes_0 = const()[name = string("op_7937_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7937_keep_dims_0 = const()[name = string("op_7937_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_7937 = reduce_sum(axes = var_7937_axes_0, keep_dims = var_7937_keep_dims_0, x = var_7927)[name = string("op_7937")];
+            tensor<fp16, [1, 8, 3, 2048]> var_7943_cast_fp16 = real_div(x = var_7927, y = var_7937)[name = string("op_7943_cast_fp16")];
+            bool attn_output_67_transpose_x_0 = const()[name = string("attn_output_67_transpose_x_0"), val = bool(false)];
+            bool attn_output_67_transpose_y_0 = const()[name = string("attn_output_67_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_cast_fp16 = transpose(perm = V_expanded_perm_0, x = reshape_47_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_67_cast_fp16 = matmul(transpose_x = attn_output_67_transpose_x_0, transpose_y = attn_output_67_transpose_y_0, x = var_7943_cast_fp16, y = V_expanded_cast_fp16)[name = string("attn_output_67_cast_fp16")];
+            tensor<int32, [4]> var_7954 = const()[name = string("op_7954"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7961 = const()[name = string("op_7961"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_7955_cast_fp16 = transpose(perm = var_7954, x = attn_output_67_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 3, 4096]> attn_output_69_cast_fp16 = reshape(shape = var_7961, x = var_7955_cast_fp16)[name = string("attn_output_69_cast_fp16")];
+            tensor<int32, [3]> var_7966 = const()[name = string("op_7966"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7982_pad_type_0 = const()[name = string("op_7982_pad_type_0"), val = string("valid")];
+            int32 var_7982_groups_0 = const()[name = string("op_7982_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7982_strides_0 = const()[name = string("op_7982_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7982_pad_0 = const()[name = string("op_7982_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7982_dilations_0 = const()[name = string("op_7982_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_11_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(580378944))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585621888))))[name = string("squeeze_11_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_7967_cast_fp16 = transpose(perm = var_7966, x = attn_output_69_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 3]> var_7982_cast_fp16 = conv(dilations = var_7982_dilations_0, groups = var_7982_groups_0, pad = var_7982_pad_0, pad_type = var_7982_pad_type_0, strides = var_7982_strides_0, weight = squeeze_11_cast_fp16_to_fp32_to_fp16_palettized, x = var_7967_cast_fp16)[name = string("op_7982_cast_fp16")];
+            tensor<int32, [3]> var_7986 = const()[name = string("op_7986"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7992 = const()[name = string("op_7992"), val = int32(-1)];
+            fp16 const_136_promoted_to_fp16 = const()[name = string("const_136_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_231_cast_fp16 = transpose(perm = var_7986, x = var_7982_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 3, 2560]> var_7994_cast_fp16 = mul(x = x_231_cast_fp16, y = const_136_promoted_to_fp16)[name = string("op_7994_cast_fp16")];
+            bool input_341_interleave_0 = const()[name = string("input_341_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_341_cast_fp16 = concat(axis = var_7992, interleave = input_341_interleave_0, values = (x_231_cast_fp16, var_7994_cast_fp16))[name = string("input_341_cast_fp16")];
+            tensor<int32, [1]> normed_325_axes_0 = const()[name = string("normed_325_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7989_to_fp16 = const()[name = string("op_7989_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_325_cast_fp16 = layer_norm(axes = normed_325_axes_0, epsilon = var_7989_to_fp16, x = input_341_cast_fp16)[name = string("normed_325_cast_fp16")];
+            tensor<int32, [2]> var_7999_split_sizes_0 = const()[name = string("op_7999_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7999_axis_0 = const()[name = string("op_7999_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7999_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7999_cast_fp16_1 = split(axis = var_7999_axis_0, split_sizes = var_7999_split_sizes_0, x = normed_325_cast_fp16)[name = string("op_7999_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585624512)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_cast_fp16 = mul(x = var_7999_cast_fp16_0, y = layers_11_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_233_cast_fp16 = add(x = x_219_cast_fp16, y = attn_output_cast_fp16)[name = string("x_233_cast_fp16")];
+            int32 var_8008 = const()[name = string("op_8008"), val = int32(-1)];
+            fp16 const_137_promoted_to_fp16 = const()[name = string("const_137_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_8010_cast_fp16 = mul(x = x_233_cast_fp16, y = const_137_promoted_to_fp16)[name = string("op_8010_cast_fp16")];
+            bool input_343_interleave_0 = const()[name = string("input_343_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_343_cast_fp16 = concat(axis = var_8008, interleave = input_343_interleave_0, values = (x_233_cast_fp16, var_8010_cast_fp16))[name = string("input_343_cast_fp16")];
+            tensor<int32, [1]> normed_329_axes_0 = const()[name = string("normed_329_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8005_to_fp16 = const()[name = string("op_8005_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_329_cast_fp16 = layer_norm(axes = normed_329_axes_0, epsilon = var_8005_to_fp16, x = input_343_cast_fp16)[name = string("normed_329_cast_fp16")];
+            tensor<int32, [2]> var_8015_split_sizes_0 = const()[name = string("op_8015_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8015_axis_0 = const()[name = string("op_8015_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_8015_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_8015_cast_fp16_1 = split(axis = var_8015_axis_0, split_sizes = var_8015_split_sizes_0, x = normed_329_cast_fp16)[name = string("op_8015_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585629696)))];
+            tensor<fp16, [1, 3, 2560]> h_69_cast_fp16 = mul(x = var_8015_cast_fp16_0, y = layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_69_cast_fp16")];
+            tensor<int32, [3]> var_8026 = const()[name = string("op_8026"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_345_axes_0 = const()[name = string("input_345_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8027 = transpose(perm = var_8026, x = h_69_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 3]> input_345 = expand_dims(axes = input_345_axes_0, x = var_8027)[name = string("input_345")];
+            string gate_45_pad_type_0 = const()[name = string("gate_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_45_strides_0 = const()[name = string("gate_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_45_pad_0 = const()[name = string("gate_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_45_dilations_0 = const()[name = string("gate_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_45_groups_0 = const()[name = string("gate_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_45 = conv(dilations = gate_45_dilations_0, groups = gate_45_groups_0, pad = gate_45_pad_0, pad_type = gate_45_pad_type_0, strides = gate_45_strides_0, weight = layers_11_mlp_gate_proj_weight_palettized, x = input_345)[name = string("gate_45")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_11_mlp_up_proj_weight_palettized, x = input_345)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate = gelu(mode = gate_mode_0, x = gate_45)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 3]> input_347 = mul(x = gate, y = up)[name = string("input_347")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_11_mlp_down_proj_weight_palettized, x = input_347)[name = string("mlp_out")];
+            tensor<int32, [1]> var_8067_axes_0 = const()[name = string("op_8067_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8067 = squeeze(axes = var_8067_axes_0, x = mlp_out)[name = string("op_8067")];
+            tensor<int32, [3]> var_8071 = const()[name = string("op_8071"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8077 = const()[name = string("op_8077"), val = int32(-1)];
+            fp16 const_138_promoted = const()[name = string("const_138_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_235 = transpose(perm = var_8071, x = var_8067)[name = string("transpose_3")];
+            tensor<fp16, [1, 3, 2560]> var_8079 = mul(x = x_235, y = const_138_promoted)[name = string("op_8079")];
+            bool input_349_interleave_0 = const()[name = string("input_349_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_349 = concat(axis = var_8077, interleave = input_349_interleave_0, values = (x_235, var_8079))[name = string("input_349")];
+            tensor<int32, [1]> normed_333_axes_0 = const()[name = string("normed_333_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8074_to_fp16 = const()[name = string("op_8074_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_333_cast_fp16 = layer_norm(axes = normed_333_axes_0, epsilon = var_8074_to_fp16, x = input_349)[name = string("normed_333_cast_fp16")];
+            tensor<int32, [2]> var_8084_split_sizes_0 = const()[name = string("op_8084_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8084_axis_0 = const()[name = string("op_8084_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_8084_0, tensor<fp16, [1, 3, 2560]> var_8084_1 = split(axis = var_8084_axis_0, split_sizes = var_8084_split_sizes_0, x = normed_333_cast_fp16)[name = string("op_8084")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_113 = mul(x = var_8084_0, y = layers_11_post_feedforward_layernorm_weight)[name = string("hidden_states_113")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_115_cast_fp16 = add(x = x_233_cast_fp16, y = hidden_states_113)[name = string("hidden_states_115_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 2816])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 3, 3072])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined_out)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_8112 = const()[name = string("op_8112"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_351_axes_0 = const()[name = string("input_351_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8113 = transpose(perm = var_8112, x = hidden_states_115_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 3]> input_351 = expand_dims(axes = input_351_axes_0, x = var_8113)[name = string("input_351")];
+            string gated_67_pad_type_0 = const()[name = string("gated_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_67_strides_0 = const()[name = string("gated_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_67_pad_0 = const()[name = string("gated_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_67_dilations_0 = const()[name = string("gated_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_67_groups_0 = const()[name = string("gated_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_67 = conv(dilations = gated_67_dilations_0, groups = gated_67_groups_0, pad = gated_67_pad_0, pad_type = gated_67_pad_type_0, strides = gated_67_strides_0, weight = layers_11_per_layer_input_gate_weight_palettized, x = input_351)[name = string("gated_67")];
+            string gated_69_mode_0 = const()[name = string("gated_69_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_69 = gelu(mode = gated_69_mode_0, x = gated_67)[name = string("gated_69")];
+            tensor<int32, [3]> var_8132 = const()[name = string("op_8132"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_8133_cast_fp16 = transpose(perm = var_8132, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_8133_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_353_cast_fp16 = mul(x = gated_69, y = per_layer_slice_conv_cast_fp16)[name = string("input_353_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_11_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585634880))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585962624))))[name = string("layers_11_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_11_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_353_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_8149_axes_0 = const()[name = string("op_8149_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8149_cast_fp16 = squeeze(axes = var_8149_axes_0, x = gated_cast_fp16)[name = string("op_8149_cast_fp16")];
+            tensor<int32, [3]> var_8153 = const()[name = string("op_8153"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8159 = const()[name = string("op_8159"), val = int32(-1)];
+            fp16 const_139_promoted_to_fp16 = const()[name = string("const_139_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_cast_fp16 = transpose(perm = var_8153, x = var_8149_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 3, 2560]> var_8161_cast_fp16 = mul(x = x_cast_fp16, y = const_139_promoted_to_fp16)[name = string("op_8161_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_cast_fp16 = concat(axis = var_8159, interleave = input_interleave_0, values = (x_cast_fp16, var_8161_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_337_axes_0 = const()[name = string("normed_337_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8156_to_fp16 = const()[name = string("op_8156_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_337_cast_fp16 = layer_norm(axes = normed_337_axes_0, epsilon = var_8156_to_fp16, x = input_cast_fp16)[name = string("normed_337_cast_fp16")];
+            tensor<int32, [2]> var_8166_split_sizes_0 = const()[name = string("op_8166_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8166_axis_0 = const()[name = string("op_8166_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_8166_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_8166_cast_fp16_1 = split(axis = var_8166_axis_0, split_sizes = var_8166_split_sizes_0, x = normed_337_cast_fp16)[name = string("op_8166_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(585965248)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_119_cast_fp16 = mul(x = var_8166_cast_fp16_0, y = layers_11_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_115_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_140_promoted_to_fp16 = const()[name = string("const_140_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.0ap-1])];
+            tensor<fp16, [1, 3, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_140_promoted_to_fp16)[name = string("op_8176_cast_fp16")];
+        } -> (hidden_states_out, K_sliding_out, V_sliding_out, K_full_out, V_full_out, per_layer_combined_out);
+}
\ No newline at end of file
diff --git a/chunk1.mlmodelc/weights/weight.bin b/chunk1.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..53ec3a0699a82617697453305636b207603c4050
--- /dev/null
+++ b/chunk1.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67c868123b2e7b2182d97a0aaca1d4e33f861ee446eac5b038bdb9f0e2c6e787
+size 585970432
diff --git a/chunk2.mlmodelc/analytics/coremldata.bin b/chunk2.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..61743beff7f654283c262d418dd79a7213652857
--- /dev/null
+++ b/chunk2.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0c032a454d2eaeea9fd5bfdfe3a2caf53e1a79d401fe8007c986abcc44469a19
+size 243
diff --git a/chunk2.mlmodelc/coremldata.bin b/chunk2.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..695ba28f5c68f164a8d779ebf6f31049bcc05106
--- /dev/null
+++ b/chunk2.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8046d35bb019573a7541c830b6b712d5176c199f51e2b1fec3a17342d86a3ac6
+size 1471
diff --git a/chunk2.mlmodelc/model.mil b/chunk2.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..fd3317ef9827b6ddda940f06043caaa8a2823b40
--- /dev/null
+++ b/chunk2.mlmodelc/model.mil
@@ -0,0 +1,8361 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func decode_q1<ios18>(tensor<fp16, [2, 2, 2048, 512]> K_full_in, tensor<fp16, [10, 2, 512, 512]> K_sliding_in, tensor<fp16, [2, 2, 2048, 512]> V_full_in, tensor<fp16, [10, 2, 512, 512]> V_sliding_in, tensor<fp16, [1, 1, 1, 2048]> causal_mask_full, tensor<fp16, [1, 1, 1, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 1, 512]> cos_f, tensor<fp16, [1, 1, 1, 256]> cos_s, tensor<fp16, [1, 1, 2560]> hidden_states, tensor<fp16, [1, 1, 10752]> per_layer_combined, tensor<fp16, [1, 1, 1, 512]> sin_f, tensor<fp16, [1, 1, 1, 256]> sin_s, tensor<fp16, [1, 1, 2048, 1]> update_mask) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3279680))))[name = string("layers_0_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3280256))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3935680))))[name = string("layers_0_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_k_norm_weight = const()[name = string("layers_0_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3936256)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3936832))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17044096))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17054400))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30161664))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30171968))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43279232))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43281856)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43287040))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43614784))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43615104))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46236608))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_q_norm_weight = const()[name = string("layers_1_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46238720)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46239296))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46894720))))[name = string("layers_1_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46895296))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47550720))))[name = string("layers_1_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_k_norm_weight = const()[name = string("layers_1_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47551296)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47551872))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60659136))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60669440))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73776704))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73787008))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86894272))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86896896)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86902080))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87229824))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87230144))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89851648))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89853760)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89854336))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90509760))))[name = string("layers_2_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90510336))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91165760))))[name = string("layers_2_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_k_norm_weight = const()[name = string("layers_2_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91166336)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91166912))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104274176))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104284480))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117391744))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117402048))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130509312))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130511936)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130517120))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130844864))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130845184))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133466688))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_q_norm_weight = const()[name = string("layers_3_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133468800)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133469376))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134124800))))[name = string("layers_3_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134125376))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134780800))))[name = string("layers_3_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_k_norm_weight = const()[name = string("layers_3_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134781376)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134781952))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147889216))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147899520))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161006784))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161017088))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174124352))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174126976)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174132160))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174459904))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174460224))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177081728))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_q_norm_weight = const()[name = string("layers_4_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177083840)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177084416))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177739840))))[name = string("layers_4_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177740416))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178395840))))[name = string("layers_4_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_k_norm_weight = const()[name = string("layers_4_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178396416)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178396992))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191504256))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191514560))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204621824))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204632128))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217739392))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217742016)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217747200))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218074944))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218075264))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223318208))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_q_norm_weight = const()[name = string("layers_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223322368)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223323456))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224634240))))[name = string("layers_5_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224635328))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225946112))))[name = string("layers_5_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_k_norm_weight = const()[name = string("layers_5_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225947200)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225948288))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239055552))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239065856))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252173120))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252183424))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265290688))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265293312)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265298496))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265626240))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265626560))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268248064))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268250176))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268905600))))[name = string("layers_6_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268906176))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269561600))))[name = string("layers_6_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_6_self_attn_k_norm_weight = const()[name = string("layers_6_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269562176)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269562752))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282670016))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282680320))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295787584))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295797888))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308905152))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308907776)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308912960))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309240704))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309241024))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311862528))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_7_self_attn_q_norm_weight = const()[name = string("layers_7_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311864640)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311865216))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(312520640))))[name = string("layers_7_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(312521216))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313176640))))[name = string("layers_7_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_7_self_attn_k_norm_weight = const()[name = string("layers_7_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313177216)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313177792))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326285056))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326295360))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339402624))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339412928))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352520192))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352522816)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352528000))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352855744))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352856064))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355477568))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355479680))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356135104))))[name = string("layers_8_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356135680))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356791104))))[name = string("layers_8_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_8_self_attn_k_norm_weight = const()[name = string("layers_8_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356791680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356792256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369899520))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369909824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383017088))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383027392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396134656))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396137280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396142464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396470208))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_9_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396470528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399092032))))[name = string("layers_9_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_q_norm_weight = const()[name = string("layers_9_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399094144)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399094720))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399750144))))[name = string("layers_9_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399750720))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400406144))))[name = string("layers_9_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_k_norm_weight = const()[name = string("layers_9_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400406720)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400407296))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413514560))))[name = string("layers_9_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413524864))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(426632128))))[name = string("layers_9_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_9_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(426642432))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439749696))))[name = string("layers_9_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_9_post_feedforward_layernorm_weight = const()[name = string("layers_9_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439752320)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_9_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439757504))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440085248))))[name = string("layers_9_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_10_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440085568))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442707072))))[name = string("layers_10_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_10_self_attn_q_norm_weight = const()[name = string("layers_10_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442709184)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442709760))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443365184))))[name = string("layers_10_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443365760))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(444021184))))[name = string("layers_10_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(444021760))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457129024))))[name = string("layers_10_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457139328))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470246592))))[name = string("layers_10_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_10_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470256896))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483364160))))[name = string("layers_10_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_10_post_feedforward_layernorm_weight = const()[name = string("layers_10_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483366784)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_10_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483371968))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483699712))))[name = string("layers_10_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_11_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483700032))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488942976))))[name = string("layers_11_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_q_norm_weight = const()[name = string("layers_11_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488947136)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488948224))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490259008))))[name = string("layers_11_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490260096))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491570880))))[name = string("layers_11_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_k_norm_weight = const()[name = string("layers_11_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491571968)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491573056))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504680320))))[name = string("layers_11_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504690624))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(517797888))))[name = string("layers_11_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_11_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(517808192))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530915456))))[name = string("layers_11_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_11_post_feedforward_layernorm_weight = const()[name = string("layers_11_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530918080)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_11_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530923264))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531251008))))[name = string("layers_11_per_layer_input_gate_weight_palettized")];
+            tensor<int32, [4]> var_736_begin_0 = const()[name = string("op_736_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_736_end_0 = const()[name = string("op_736_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_736_end_mask_0 = const()[name = string("op_736_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_736_squeeze_mask_0 = const()[name = string("op_736_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_736_cast_fp16 = slice_by_index(begin = var_736_begin_0, end = var_736_end_0, end_mask = var_736_end_mask_0, squeeze_mask = var_736_squeeze_mask_0, x = K_sliding_in)[name = string("op_736_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_1_axes_0 = const()[name = string("K_sliding_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_1_cast_fp16 = expand_dims(axes = K_sliding_slot_1_axes_0, x = var_736_cast_fp16)[name = string("K_sliding_slot_1_cast_fp16")];
+            tensor<int32, [4]> var_741_begin_0 = const()[name = string("op_741_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_741_end_0 = const()[name = string("op_741_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_741_end_mask_0 = const()[name = string("op_741_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_741_squeeze_mask_0 = const()[name = string("op_741_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_741_cast_fp16 = slice_by_index(begin = var_741_begin_0, end = var_741_end_0, end_mask = var_741_end_mask_0, squeeze_mask = var_741_squeeze_mask_0, x = V_sliding_in)[name = string("op_741_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_1_axes_0 = const()[name = string("V_sliding_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_1_cast_fp16 = expand_dims(axes = V_sliding_slot_1_axes_0, x = var_741_cast_fp16)[name = string("V_sliding_slot_1_cast_fp16")];
+            int32 var_748 = const()[name = string("op_748"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_750_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_750_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_1_cast_fp16 = concat(axis = var_748, interleave = input_1_interleave_0, values = (hidden_states, var_750_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_745_to_fp16 = const()[name = string("op_745_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_745_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_755_split_sizes_0 = const()[name = string("op_755_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_755_axis_0 = const()[name = string("op_755_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_755_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_755_cast_fp16_1 = split(axis = var_755_axis_0, split_sizes = var_755_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_755_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531251328)))];
+            tensor<fp16, [1, 1, 2560]> h_1_cast_fp16 = mul(x = var_755_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_761 = const()[name = string("op_761"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_764_axes_0 = const()[name = string("op_764_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_762_cast_fp16 = transpose(perm = var_761, x = h_1_cast_fp16)[name = string("transpose_215")];
+            tensor<fp16, [1, 2560, 1, 1]> var_764_cast_fp16 = expand_dims(axes = var_764_axes_0, x = var_762_cast_fp16)[name = string("op_764_cast_fp16")];
+            string var_780_pad_type_0 = const()[name = string("op_780_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_780_strides_0 = const()[name = string("op_780_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_780_pad_0 = const()[name = string("op_780_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_780_dilations_0 = const()[name = string("op_780_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_780_groups_0 = const()[name = string("op_780_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_780 = conv(dilations = var_780_dilations_0, groups = var_780_groups_0, pad = var_780_pad_0, pad_type = var_780_pad_type_0, strides = var_780_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_764_cast_fp16)[name = string("op_780")];
+            tensor<int32, [4]> var_785 = const()[name = string("op_785"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_786 = reshape(shape = var_785, x = var_780)[name = string("op_786")];
+            tensor<int32, [4]> var_791 = const()[name = string("op_791"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_801 = const()[name = string("op_801"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_792 = transpose(perm = var_791, x = var_786)[name = string("transpose_214")];
+            tensor<fp16, [1, 8, 256]> x_1 = reshape(shape = var_801, x = var_792)[name = string("x_1")];
+            int32 var_807 = const()[name = string("op_807"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_809 = mul(x = x_1, y = const_1_promoted)[name = string("op_809")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_5 = concat(axis = var_807, interleave = input_5_interleave_0, values = (x_1, var_809))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_804_to_fp16 = const()[name = string("op_804_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_804_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_814_split_sizes_0 = const()[name = string("op_814_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_814_axis_0 = const()[name = string("op_814_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_814_0, tensor<fp16, [1, 8, 256]> var_814_1 = split(axis = var_814_axis_0, split_sizes = var_814_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_814")];
+            tensor<fp16, [1, 8, 256]> var_816 = mul(x = var_814_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_816")];
+            tensor<int32, [4]> var_821 = const()[name = string("op_821"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_3 = reshape(shape = var_821, x = var_816)[name = string("q_3")];
+            tensor<fp16, [1, 8, 1, 256]> var_823_cast_fp16 = mul(x = q_3, y = cos_s)[name = string("op_823_cast_fp16")];
+            tensor<int32, [2]> var_824_split_sizes_0 = const()[name = string("op_824_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_824_axis_0 = const()[name = string("op_824_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_824_0, tensor<fp16, [1, 8, 1, 128]> var_824_1 = split(axis = var_824_axis_0, split_sizes = var_824_split_sizes_0, x = q_3)[name = string("op_824")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_826 = mul(x = var_824_1, y = const_2_promoted)[name = string("op_826")];
+            int32 var_828 = const()[name = string("op_828"), val = int32(-1)];
+            bool var_829_interleave_0 = const()[name = string("op_829_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_829 = concat(axis = var_828, interleave = var_829_interleave_0, values = (var_826, var_824_0))[name = string("op_829")];
+            tensor<fp16, [1, 8, 1, 256]> var_830_cast_fp16 = mul(x = var_829, y = sin_s)[name = string("op_830_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_7_cast_fp16 = add(x = var_823_cast_fp16, y = var_830_cast_fp16)[name = string("q_7_cast_fp16")];
+            string var_843_pad_type_0 = const()[name = string("op_843_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_843_strides_0 = const()[name = string("op_843_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_843_pad_0 = const()[name = string("op_843_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_843_dilations_0 = const()[name = string("op_843_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_843_groups_0 = const()[name = string("op_843_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_843 = conv(dilations = var_843_dilations_0, groups = var_843_groups_0, pad = var_843_pad_0, pad_type = var_843_pad_type_0, strides = var_843_strides_0, weight = layers_0_self_attn_k_proj_weight_palettized, x = var_764_cast_fp16)[name = string("op_843")];
+            tensor<int32, [4]> var_848 = const()[name = string("op_848"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_849 = reshape(shape = var_848, x = var_843)[name = string("op_849")];
+            tensor<int32, [4]> var_854 = const()[name = string("op_854"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_871_pad_type_0 = const()[name = string("op_871_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_871_strides_0 = const()[name = string("op_871_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_871_pad_0 = const()[name = string("op_871_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_871_dilations_0 = const()[name = string("op_871_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_871_groups_0 = const()[name = string("op_871_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_871 = conv(dilations = var_871_dilations_0, groups = var_871_groups_0, pad = var_871_pad_0, pad_type = var_871_pad_type_0, strides = var_871_strides_0, weight = layers_0_self_attn_v_proj_weight_palettized, x = var_764_cast_fp16)[name = string("op_871")];
+            tensor<int32, [4]> var_876 = const()[name = string("op_876"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_877 = reshape(shape = var_876, x = var_871)[name = string("op_877")];
+            tensor<int32, [4]> var_882 = const()[name = string("op_882"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_892 = const()[name = string("op_892"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_855 = transpose(perm = var_854, x = var_849)[name = string("transpose_213")];
+            tensor<fp16, [1, 2, 256]> x_3 = reshape(shape = var_892, x = var_855)[name = string("x_3")];
+            int32 var_898 = const()[name = string("op_898"), val = int32(-1)];
+            fp16 const_3_promoted = const()[name = string("const_3_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_900 = mul(x = x_3, y = const_3_promoted)[name = string("op_900")];
+            bool input_7_interleave_0 = const()[name = string("input_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_7 = concat(axis = var_898, interleave = input_7_interleave_0, values = (x_3, var_900))[name = string("input_7")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_895_to_fp16 = const()[name = string("op_895_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_895_to_fp16, x = input_7)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_905_split_sizes_0 = const()[name = string("op_905_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_905_axis_0 = const()[name = string("op_905_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_905_0, tensor<fp16, [1, 2, 256]> var_905_1 = split(axis = var_905_axis_0, split_sizes = var_905_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_905")];
+            tensor<fp16, [1, 2, 256]> var_907 = mul(x = var_905_0, y = layers_0_self_attn_k_norm_weight)[name = string("op_907")];
+            tensor<int32, [4]> var_912 = const()[name = string("op_912"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_5 = reshape(shape = var_912, x = var_907)[name = string("q_5")];
+            fp16 var_914_promoted = const()[name = string("op_914_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_883 = transpose(perm = var_882, x = var_877)[name = string("transpose_212")];
+            tensor<fp16, [1, 2, 1, 256]> var_915 = pow(x = var_883, y = var_914_promoted)[name = string("op_915")];
+            tensor<int32, [1]> var_920_axes_0 = const()[name = string("op_920_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_920_keep_dims_0 = const()[name = string("op_920_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_920 = reduce_mean(axes = var_920_axes_0, keep_dims = var_920_keep_dims_0, x = var_915)[name = string("op_920")];
+            fp16 var_922_to_fp16 = const()[name = string("op_922_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_1_cast_fp16 = add(x = var_920, y = var_922_to_fp16)[name = string("mean_sq_1_cast_fp16")];
+            fp32 var_924_epsilon_0 = const()[name = string("op_924_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_924_cast_fp16 = rsqrt(epsilon = var_924_epsilon_0, x = mean_sq_1_cast_fp16)[name = string("op_924_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_11_cast_fp16 = mul(x = var_883, y = var_924_cast_fp16)[name = string("input_11_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_926_cast_fp16 = mul(x = q_5, y = cos_s)[name = string("op_926_cast_fp16")];
+            tensor<int32, [2]> var_927_split_sizes_0 = const()[name = string("op_927_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_927_axis_0 = const()[name = string("op_927_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_927_0, tensor<fp16, [1, 2, 1, 128]> var_927_1 = split(axis = var_927_axis_0, split_sizes = var_927_split_sizes_0, x = q_5)[name = string("op_927")];
+            fp16 const_4_promoted = const()[name = string("const_4_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_929 = mul(x = var_927_1, y = const_4_promoted)[name = string("op_929")];
+            int32 var_931 = const()[name = string("op_931"), val = int32(-1)];
+            bool var_932_interleave_0 = const()[name = string("op_932_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_932 = concat(axis = var_931, interleave = var_932_interleave_0, values = (var_929, var_927_0))[name = string("op_932")];
+            tensor<fp16, [1, 2, 1, 256]> var_933_cast_fp16 = mul(x = var_932, y = sin_s)[name = string("op_933_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_9_cast_fp16 = add(x = var_926_cast_fp16, y = var_933_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [8]> k_padded_1_pad_0 = const()[name = string("k_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_1_mode_0 = const()[name = string("k_padded_1_mode_0"), val = string("constant")];
+            fp16 const_5_to_fp16 = const()[name = string("const_5_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_1_cast_fp16 = pad(constant_val = const_5_to_fp16, mode = k_padded_1_mode_0, pad = k_padded_1_pad_0, x = input_9_cast_fp16)[name = string("k_padded_1_cast_fp16")];
+            tensor<int32, [8]> v_padded_1_pad_0 = const()[name = string("v_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_1_mode_0 = const()[name = string("v_padded_1_mode_0"), val = string("constant")];
+            fp16 const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_1_cast_fp16 = pad(constant_val = const_6_to_fp16, mode = v_padded_1_mode_0, pad = v_padded_1_pad_0, x = input_11_cast_fp16)[name = string("v_padded_1_cast_fp16")];
+            tensor<int32, [4]> var_962_begin_0 = const()[name = string("op_962_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_962_end_0 = const()[name = string("op_962_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_962_end_mask_0 = const()[name = string("op_962_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_962_cast_fp16 = slice_by_index(begin = var_962_begin_0, end = var_962_end_0, end_mask = var_962_end_mask_0, x = K_sliding_slot_1_cast_fp16)[name = string("op_962_cast_fp16")];
+            int32 var_969 = const()[name = string("op_969"), val = int32(2)];
+            bool K_sliding_out_1_interleave_0 = const()[name = string("K_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_1_cast_fp16 = concat(axis = var_969, interleave = K_sliding_out_1_interleave_0, values = (var_962_cast_fp16, k_padded_1_cast_fp16))[name = string("K_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_985_begin_0 = const()[name = string("op_985_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_985_end_0 = const()[name = string("op_985_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_985_end_mask_0 = const()[name = string("op_985_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_985_cast_fp16 = slice_by_index(begin = var_985_begin_0, end = var_985_end_0, end_mask = var_985_end_mask_0, x = V_sliding_slot_1_cast_fp16)[name = string("op_985_cast_fp16")];
+            int32 var_992 = const()[name = string("op_992"), val = int32(2)];
+            bool V_sliding_out_1_interleave_0 = const()[name = string("V_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_1_cast_fp16 = concat(axis = var_992, interleave = V_sliding_out_1_interleave_0, values = (var_985_cast_fp16, v_padded_1_cast_fp16))[name = string("V_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_1_begin_0 = const()[name = string("K_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_1_end_0 = const()[name = string("K_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_1_end_mask_0 = const()[name = string("K_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_1_cast_fp16 = slice_by_index(begin = K_for_attn_1_begin_0, end = K_for_attn_1_end_0, end_mask = K_for_attn_1_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("K_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_1_begin_0 = const()[name = string("V_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_1_end_0 = const()[name = string("V_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_1_end_mask_0 = const()[name = string("V_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_1_cast_fp16 = slice_by_index(begin = V_for_attn_1_begin_0, end = V_for_attn_1_end_0, end_mask = V_for_attn_1_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("V_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = K_for_attn_1_cast_fp16)[name = string("transpose_211")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_210")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_48_perm_0 = const()[name = string("transpose_48_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = V_for_attn_1_cast_fp16)[name = string("transpose_209")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_208")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_48_cast_fp16 = transpose(perm = transpose_48_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_207")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_7_cast_fp16, y = transpose_48_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_7_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_7_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_7_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 1, 512]> var_1033 = sub(x = x_7_cast_fp16, y = reduce_max_0)[name = string("op_1033")];
+            tensor<fp16, [1, 8, 1, 512]> var_1039 = exp(x = var_1033)[name = string("op_1039")];
+            tensor<int32, [1]> var_1049_axes_0 = const()[name = string("op_1049_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1049_keep_dims_0 = const()[name = string("op_1049_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1049 = reduce_sum(axes = var_1049_axes_0, keep_dims = var_1049_keep_dims_0, x = var_1039)[name = string("op_1049")];
+            tensor<fp16, [1, 8, 1, 512]> var_1055_cast_fp16 = real_div(x = var_1039, y = var_1049)[name = string("op_1055_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_206")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_1055_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_1066 = const()[name = string("op_1066"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1073 = const()[name = string("op_1073"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1067_cast_fp16 = transpose(perm = var_1066, x = attn_output_1_cast_fp16)[name = string("transpose_205")];
+            tensor<fp16, [1, 1, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_1073, x = var_1067_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_1078 = const()[name = string("op_1078"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1094_pad_type_0 = const()[name = string("op_1094_pad_type_0"), val = string("valid")];
+            int32 var_1094_groups_0 = const()[name = string("op_1094_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1094_strides_0 = const()[name = string("op_1094_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1094_pad_0 = const()[name = string("op_1094_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1094_dilations_0 = const()[name = string("op_1094_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531256512))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533878016))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1079_cast_fp16 = transpose(perm = var_1078, x = attn_output_3_cast_fp16)[name = string("transpose_204")];
+            tensor<fp16, [1, 2560, 1]> var_1094_cast_fp16 = conv(dilations = var_1094_dilations_0, groups = var_1094_groups_0, pad = var_1094_pad_0, pad_type = var_1094_pad_type_0, strides = var_1094_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_1079_cast_fp16)[name = string("op_1094_cast_fp16")];
+            tensor<int32, [3]> var_1098 = const()[name = string("op_1098"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1104 = const()[name = string("op_1104"), val = int32(-1)];
+            fp16 const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_11_cast_fp16 = transpose(perm = var_1098, x = var_1094_cast_fp16)[name = string("transpose_203")];
+            tensor<fp16, [1, 1, 2560]> var_1106_cast_fp16 = mul(x = x_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("op_1106_cast_fp16")];
+            bool input_15_interleave_0 = const()[name = string("input_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_15_cast_fp16 = concat(axis = var_1104, interleave = input_15_interleave_0, values = (x_11_cast_fp16, var_1106_cast_fp16))[name = string("input_15_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1101_to_fp16 = const()[name = string("op_1101_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_1101_to_fp16, x = input_15_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_1111_split_sizes_0 = const()[name = string("op_1111_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1111_axis_0 = const()[name = string("op_1111_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1111_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1111_cast_fp16_1 = split(axis = var_1111_axis_0, split_sizes = var_1111_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_1111_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533880640)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_5_cast_fp16 = mul(x = var_1111_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_13_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_13_cast_fp16")];
+            int32 var_1120 = const()[name = string("op_1120"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1122_cast_fp16 = mul(x = x_13_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_1122_cast_fp16")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_17_cast_fp16 = concat(axis = var_1120, interleave = input_17_interleave_0, values = (x_13_cast_fp16, var_1122_cast_fp16))[name = string("input_17_cast_fp16")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1117_to_fp16 = const()[name = string("op_1117_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_1117_to_fp16, x = input_17_cast_fp16)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_1127_split_sizes_0 = const()[name = string("op_1127_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1127_axis_0 = const()[name = string("op_1127_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1127_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1127_cast_fp16_1 = split(axis = var_1127_axis_0, split_sizes = var_1127_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_1127_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533885824)))];
+            tensor<fp16, [1, 1, 2560]> h_3_cast_fp16 = mul(x = var_1127_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_1138 = const()[name = string("op_1138"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1139 = transpose(perm = var_1138, x = h_3_cast_fp16)[name = string("transpose_202")];
+            tensor<fp16, [1, 2560, 1, 1]> input_19 = expand_dims(axes = input_19_axes_0, x = var_1139)[name = string("input_19")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_19)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_19)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 1]> input_21 = mul(x = gate_3, y = up_1)[name = string("input_21")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_21)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_1179_axes_0 = const()[name = string("op_1179_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1179 = squeeze(axes = var_1179_axes_0, x = mlp_out_1)[name = string("op_1179")];
+            tensor<int32, [3]> var_1183 = const()[name = string("op_1183"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1189 = const()[name = string("op_1189"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_15 = transpose(perm = var_1183, x = var_1179)[name = string("transpose_201")];
+            tensor<fp16, [1, 1, 2560]> var_1191 = mul(x = x_15, y = const_9_promoted)[name = string("op_1191")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_23 = concat(axis = var_1189, interleave = input_23_interleave_0, values = (x_15, var_1191))[name = string("input_23")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1186_to_fp16 = const()[name = string("op_1186_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_1186_to_fp16, x = input_23)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_1196_split_sizes_0 = const()[name = string("op_1196_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1196_axis_0 = const()[name = string("op_1196_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1196_0, tensor<fp16, [1, 1, 2560]> var_1196_1 = split(axis = var_1196_axis_0, split_sizes = var_1196_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_1196")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_3 = mul(x = var_1196_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_5_cast_fp16 = add(x = x_13_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 3072])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 1, 3328])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_1224 = const()[name = string("op_1224"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_25_axes_0 = const()[name = string("input_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1225 = transpose(perm = var_1224, x = hidden_states_5_cast_fp16)[name = string("transpose_200")];
+            tensor<fp16, [1, 2560, 1, 1]> input_25 = expand_dims(axes = input_25_axes_0, x = var_1225)[name = string("input_25")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_25)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_1244 = const()[name = string("op_1244"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1245_cast_fp16 = transpose(perm = var_1244, x = per_layer_slice_1_cast_fp16)[name = string("transpose_199")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_1245_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_27_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_27_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533891008))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534218752))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_27_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_1261_axes_0 = const()[name = string("op_1261_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1261_cast_fp16 = squeeze(axes = var_1261_axes_0, x = gated_5_cast_fp16)[name = string("op_1261_cast_fp16")];
+            tensor<int32, [3]> var_1265 = const()[name = string("op_1265"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1271 = const()[name = string("op_1271"), val = int32(-1)];
+            fp16 const_10_promoted_to_fp16 = const()[name = string("const_10_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_17_cast_fp16 = transpose(perm = var_1265, x = var_1261_cast_fp16)[name = string("transpose_198")];
+            tensor<fp16, [1, 1, 2560]> var_1273_cast_fp16 = mul(x = x_17_cast_fp16, y = const_10_promoted_to_fp16)[name = string("op_1273_cast_fp16")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_29_cast_fp16 = concat(axis = var_1271, interleave = input_29_interleave_0, values = (x_17_cast_fp16, var_1273_cast_fp16))[name = string("input_29_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1268_to_fp16 = const()[name = string("op_1268_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_1268_to_fp16, x = input_29_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_1278_split_sizes_0 = const()[name = string("op_1278_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1278_axis_0 = const()[name = string("op_1278_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1278_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1278_cast_fp16_1 = split(axis = var_1278_axis_0, split_sizes = var_1278_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_1278_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534221376)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_9_cast_fp16 = mul(x = var_1278_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.7ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_19_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_11_promoted_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> var_1290_axes_0 = const()[name = string("op_1290_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1290_cast_fp16 = squeeze(axes = var_1290_axes_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1290_cast_fp16")];
+            tensor<int32, [1]> var_1292_axes_0 = const()[name = string("op_1292_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1292_cast_fp16 = squeeze(axes = var_1292_axes_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1292_cast_fp16")];
+            tensor<int32, [4]> var_1295_begin_0 = const()[name = string("op_1295_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1295_end_0 = const()[name = string("op_1295_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1295_end_mask_0 = const()[name = string("op_1295_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1295_squeeze_mask_0 = const()[name = string("op_1295_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1295_cast_fp16 = slice_by_index(begin = var_1295_begin_0, end = var_1295_end_0, end_mask = var_1295_end_mask_0, squeeze_mask = var_1295_squeeze_mask_0, x = K_sliding_in)[name = string("op_1295_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_3_axes_0 = const()[name = string("K_sliding_slot_3_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_3_cast_fp16 = expand_dims(axes = K_sliding_slot_3_axes_0, x = var_1295_cast_fp16)[name = string("K_sliding_slot_3_cast_fp16")];
+            tensor<int32, [4]> var_1300_begin_0 = const()[name = string("op_1300_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1300_end_0 = const()[name = string("op_1300_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1300_end_mask_0 = const()[name = string("op_1300_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1300_squeeze_mask_0 = const()[name = string("op_1300_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1300_cast_fp16 = slice_by_index(begin = var_1300_begin_0, end = var_1300_end_0, end_mask = var_1300_end_mask_0, squeeze_mask = var_1300_squeeze_mask_0, x = V_sliding_in)[name = string("op_1300_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_3_axes_0 = const()[name = string("V_sliding_slot_3_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_3_cast_fp16 = expand_dims(axes = V_sliding_slot_3_axes_0, x = var_1300_cast_fp16)[name = string("V_sliding_slot_3_cast_fp16")];
+            int32 var_1307 = const()[name = string("op_1307"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1309_cast_fp16 = mul(x = x_19_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1309_cast_fp16")];
+            bool input_31_interleave_0 = const()[name = string("input_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_31_cast_fp16 = concat(axis = var_1307, interleave = input_31_interleave_0, values = (x_19_cast_fp16, var_1309_cast_fp16))[name = string("input_31_cast_fp16")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1304_to_fp16 = const()[name = string("op_1304_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_1304_to_fp16, x = input_31_cast_fp16)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_1314_split_sizes_0 = const()[name = string("op_1314_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1314_axis_0 = const()[name = string("op_1314_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1314_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1314_cast_fp16_1 = split(axis = var_1314_axis_0, split_sizes = var_1314_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_1314_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534226560)))];
+            tensor<fp16, [1, 1, 2560]> h_7_cast_fp16 = mul(x = var_1314_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_1320 = const()[name = string("op_1320"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1323_axes_0 = const()[name = string("op_1323_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1321_cast_fp16 = transpose(perm = var_1320, x = h_7_cast_fp16)[name = string("transpose_197")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1323_cast_fp16 = expand_dims(axes = var_1323_axes_0, x = var_1321_cast_fp16)[name = string("op_1323_cast_fp16")];
+            string var_1339_pad_type_0 = const()[name = string("op_1339_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1339_strides_0 = const()[name = string("op_1339_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1339_pad_0 = const()[name = string("op_1339_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1339_dilations_0 = const()[name = string("op_1339_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1339_groups_0 = const()[name = string("op_1339_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1339 = conv(dilations = var_1339_dilations_0, groups = var_1339_groups_0, pad = var_1339_pad_0, pad_type = var_1339_pad_type_0, strides = var_1339_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_1323_cast_fp16)[name = string("op_1339")];
+            tensor<int32, [4]> var_1344 = const()[name = string("op_1344"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1345 = reshape(shape = var_1344, x = var_1339)[name = string("op_1345")];
+            tensor<int32, [4]> var_1350 = const()[name = string("op_1350"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1360 = const()[name = string("op_1360"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1351 = transpose(perm = var_1350, x = var_1345)[name = string("transpose_196")];
+            tensor<fp16, [1, 8, 256]> x_21 = reshape(shape = var_1360, x = var_1351)[name = string("x_21")];
+            int32 var_1366 = const()[name = string("op_1366"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1368 = mul(x = x_21, y = const_13_promoted)[name = string("op_1368")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_35 = concat(axis = var_1366, interleave = input_35_interleave_0, values = (x_21, var_1368))[name = string("input_35")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1363_to_fp16 = const()[name = string("op_1363_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1363_to_fp16, x = input_35)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1373_split_sizes_0 = const()[name = string("op_1373_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1373_axis_0 = const()[name = string("op_1373_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1373_0, tensor<fp16, [1, 8, 256]> var_1373_1 = split(axis = var_1373_axis_0, split_sizes = var_1373_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1373")];
+            tensor<fp16, [1, 8, 256]> var_1375 = mul(x = var_1373_0, y = layers_1_self_attn_q_norm_weight)[name = string("op_1375")];
+            tensor<int32, [4]> var_1380 = const()[name = string("op_1380"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_11 = reshape(shape = var_1380, x = var_1375)[name = string("q_11")];
+            tensor<fp16, [1, 8, 1, 256]> var_1382_cast_fp16 = mul(x = q_11, y = cos_s)[name = string("op_1382_cast_fp16")];
+            tensor<int32, [2]> var_1383_split_sizes_0 = const()[name = string("op_1383_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1383_axis_0 = const()[name = string("op_1383_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1383_0, tensor<fp16, [1, 8, 1, 128]> var_1383_1 = split(axis = var_1383_axis_0, split_sizes = var_1383_split_sizes_0, x = q_11)[name = string("op_1383")];
+            fp16 const_14_promoted = const()[name = string("const_14_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1385 = mul(x = var_1383_1, y = const_14_promoted)[name = string("op_1385")];
+            int32 var_1387 = const()[name = string("op_1387"), val = int32(-1)];
+            bool var_1388_interleave_0 = const()[name = string("op_1388_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1388 = concat(axis = var_1387, interleave = var_1388_interleave_0, values = (var_1385, var_1383_0))[name = string("op_1388")];
+            tensor<fp16, [1, 8, 1, 256]> var_1389_cast_fp16 = mul(x = var_1388, y = sin_s)[name = string("op_1389_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_15_cast_fp16 = add(x = var_1382_cast_fp16, y = var_1389_cast_fp16)[name = string("q_15_cast_fp16")];
+            string var_1402_pad_type_0 = const()[name = string("op_1402_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1402_strides_0 = const()[name = string("op_1402_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1402_pad_0 = const()[name = string("op_1402_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1402_dilations_0 = const()[name = string("op_1402_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1402_groups_0 = const()[name = string("op_1402_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1402 = conv(dilations = var_1402_dilations_0, groups = var_1402_groups_0, pad = var_1402_pad_0, pad_type = var_1402_pad_type_0, strides = var_1402_strides_0, weight = layers_1_self_attn_k_proj_weight_palettized, x = var_1323_cast_fp16)[name = string("op_1402")];
+            tensor<int32, [4]> var_1407 = const()[name = string("op_1407"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1408 = reshape(shape = var_1407, x = var_1402)[name = string("op_1408")];
+            tensor<int32, [4]> var_1413 = const()[name = string("op_1413"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_1430_pad_type_0 = const()[name = string("op_1430_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1430_strides_0 = const()[name = string("op_1430_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1430_pad_0 = const()[name = string("op_1430_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1430_dilations_0 = const()[name = string("op_1430_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1430_groups_0 = const()[name = string("op_1430_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1430 = conv(dilations = var_1430_dilations_0, groups = var_1430_groups_0, pad = var_1430_pad_0, pad_type = var_1430_pad_type_0, strides = var_1430_strides_0, weight = layers_1_self_attn_v_proj_weight_palettized, x = var_1323_cast_fp16)[name = string("op_1430")];
+            tensor<int32, [4]> var_1435 = const()[name = string("op_1435"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1436 = reshape(shape = var_1435, x = var_1430)[name = string("op_1436")];
+            tensor<int32, [4]> var_1441 = const()[name = string("op_1441"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1451 = const()[name = string("op_1451"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_1414 = transpose(perm = var_1413, x = var_1408)[name = string("transpose_195")];
+            tensor<fp16, [1, 2, 256]> x_23 = reshape(shape = var_1451, x = var_1414)[name = string("x_23")];
+            int32 var_1457 = const()[name = string("op_1457"), val = int32(-1)];
+            fp16 const_15_promoted = const()[name = string("const_15_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_1459 = mul(x = x_23, y = const_15_promoted)[name = string("op_1459")];
+            bool input_37_interleave_0 = const()[name = string("input_37_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_37 = concat(axis = var_1457, interleave = input_37_interleave_0, values = (x_23, var_1459))[name = string("input_37")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1454_to_fp16 = const()[name = string("op_1454_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1454_to_fp16, x = input_37)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1464_split_sizes_0 = const()[name = string("op_1464_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1464_axis_0 = const()[name = string("op_1464_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_1464_0, tensor<fp16, [1, 2, 256]> var_1464_1 = split(axis = var_1464_axis_0, split_sizes = var_1464_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1464")];
+            tensor<fp16, [1, 2, 256]> var_1466 = mul(x = var_1464_0, y = layers_1_self_attn_k_norm_weight)[name = string("op_1466")];
+            tensor<int32, [4]> var_1471 = const()[name = string("op_1471"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_13 = reshape(shape = var_1471, x = var_1466)[name = string("q_13")];
+            fp16 var_1473_promoted = const()[name = string("op_1473_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_1442 = transpose(perm = var_1441, x = var_1436)[name = string("transpose_194")];
+            tensor<fp16, [1, 2, 1, 256]> var_1474 = pow(x = var_1442, y = var_1473_promoted)[name = string("op_1474")];
+            tensor<int32, [1]> var_1479_axes_0 = const()[name = string("op_1479_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1479_keep_dims_0 = const()[name = string("op_1479_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_1479 = reduce_mean(axes = var_1479_axes_0, keep_dims = var_1479_keep_dims_0, x = var_1474)[name = string("op_1479")];
+            fp16 var_1481_to_fp16 = const()[name = string("op_1481_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_3_cast_fp16 = add(x = var_1479, y = var_1481_to_fp16)[name = string("mean_sq_3_cast_fp16")];
+            fp32 var_1483_epsilon_0 = const()[name = string("op_1483_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_1483_cast_fp16 = rsqrt(epsilon = var_1483_epsilon_0, x = mean_sq_3_cast_fp16)[name = string("op_1483_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_41_cast_fp16 = mul(x = var_1442, y = var_1483_cast_fp16)[name = string("input_41_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_1485_cast_fp16 = mul(x = q_13, y = cos_s)[name = string("op_1485_cast_fp16")];
+            tensor<int32, [2]> var_1486_split_sizes_0 = const()[name = string("op_1486_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1486_axis_0 = const()[name = string("op_1486_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_1486_0, tensor<fp16, [1, 2, 1, 128]> var_1486_1 = split(axis = var_1486_axis_0, split_sizes = var_1486_split_sizes_0, x = q_13)[name = string("op_1486")];
+            fp16 const_16_promoted = const()[name = string("const_16_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_1488 = mul(x = var_1486_1, y = const_16_promoted)[name = string("op_1488")];
+            int32 var_1490 = const()[name = string("op_1490"), val = int32(-1)];
+            bool var_1491_interleave_0 = const()[name = string("op_1491_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_1491 = concat(axis = var_1490, interleave = var_1491_interleave_0, values = (var_1488, var_1486_0))[name = string("op_1491")];
+            tensor<fp16, [1, 2, 1, 256]> var_1492_cast_fp16 = mul(x = var_1491, y = sin_s)[name = string("op_1492_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_39_cast_fp16 = add(x = var_1485_cast_fp16, y = var_1492_cast_fp16)[name = string("input_39_cast_fp16")];
+            tensor<int32, [8]> k_padded_3_pad_0 = const()[name = string("k_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_3_mode_0 = const()[name = string("k_padded_3_mode_0"), val = string("constant")];
+            fp16 const_17_to_fp16 = const()[name = string("const_17_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_3_cast_fp16 = pad(constant_val = const_17_to_fp16, mode = k_padded_3_mode_0, pad = k_padded_3_pad_0, x = input_39_cast_fp16)[name = string("k_padded_3_cast_fp16")];
+            tensor<int32, [8]> v_padded_3_pad_0 = const()[name = string("v_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_3_mode_0 = const()[name = string("v_padded_3_mode_0"), val = string("constant")];
+            fp16 const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_3_cast_fp16 = pad(constant_val = const_18_to_fp16, mode = v_padded_3_mode_0, pad = v_padded_3_pad_0, x = input_41_cast_fp16)[name = string("v_padded_3_cast_fp16")];
+            tensor<int32, [4]> var_1521_begin_0 = const()[name = string("op_1521_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1521_end_0 = const()[name = string("op_1521_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1521_end_mask_0 = const()[name = string("op_1521_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1521_cast_fp16 = slice_by_index(begin = var_1521_begin_0, end = var_1521_end_0, end_mask = var_1521_end_mask_0, x = K_sliding_slot_3_cast_fp16)[name = string("op_1521_cast_fp16")];
+            int32 var_1528 = const()[name = string("op_1528"), val = int32(2)];
+            bool K_sliding_out_3_interleave_0 = const()[name = string("K_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_3_cast_fp16 = concat(axis = var_1528, interleave = K_sliding_out_3_interleave_0, values = (var_1521_cast_fp16, k_padded_3_cast_fp16))[name = string("K_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1544_begin_0 = const()[name = string("op_1544_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1544_end_0 = const()[name = string("op_1544_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1544_end_mask_0 = const()[name = string("op_1544_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1544_cast_fp16 = slice_by_index(begin = var_1544_begin_0, end = var_1544_end_0, end_mask = var_1544_end_mask_0, x = V_sliding_slot_3_cast_fp16)[name = string("op_1544_cast_fp16")];
+            int32 var_1551 = const()[name = string("op_1551"), val = int32(2)];
+            bool V_sliding_out_3_interleave_0 = const()[name = string("V_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_3_cast_fp16 = concat(axis = var_1551, interleave = V_sliding_out_3_interleave_0, values = (var_1544_cast_fp16, v_padded_3_cast_fp16))[name = string("V_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_3_begin_0 = const()[name = string("K_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_3_end_0 = const()[name = string("K_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_3_end_mask_0 = const()[name = string("K_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_3_cast_fp16 = slice_by_index(begin = K_for_attn_3_begin_0, end = K_for_attn_3_end_0, end_mask = K_for_attn_3_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("K_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_3_begin_0 = const()[name = string("V_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_3_end_0 = const()[name = string("V_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_3_end_mask_0 = const()[name = string("V_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_3_cast_fp16 = slice_by_index(begin = V_for_attn_3_begin_0, end = V_for_attn_3_end_0, end_mask = V_for_attn_3_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("V_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> transpose_4_perm_0 = const()[name = string("transpose_4_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_2_reps_0 = const()[name = string("tile_2_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_4_cast_fp16 = transpose(perm = transpose_4_perm_0, x = K_for_attn_3_cast_fp16)[name = string("transpose_193")];
+            tensor<fp16, [8, 1, 512, 256]> tile_2_cast_fp16 = tile(reps = tile_2_reps_0, x = transpose_4_cast_fp16)[name = string("tile_2_cast_fp16")];
+            tensor<int32, [5]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_4_cast_fp16 = reshape(shape = concat_4, x = tile_2_cast_fp16)[name = string("reshape_4_cast_fp16")];
+            tensor<int32, [5]> transpose_5_perm_0 = const()[name = string("transpose_5_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_5 = const()[name = string("concat_5"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_5_cast_fp16 = transpose(perm = transpose_5_perm_0, x = reshape_4_cast_fp16)[name = string("transpose_192")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_5_cast_fp16 = reshape(shape = concat_5, x = transpose_5_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [4]> transpose_49_perm_0 = const()[name = string("transpose_49_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_6_perm_0 = const()[name = string("transpose_6_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_3_reps_0 = const()[name = string("tile_3_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_6_cast_fp16 = transpose(perm = transpose_6_perm_0, x = V_for_attn_3_cast_fp16)[name = string("transpose_191")];
+            tensor<fp16, [8, 1, 512, 256]> tile_3_cast_fp16 = tile(reps = tile_3_reps_0, x = transpose_6_cast_fp16)[name = string("tile_3_cast_fp16")];
+            tensor<int32, [5]> concat_6 = const()[name = string("concat_6"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_6_cast_fp16 = reshape(shape = concat_6, x = tile_3_cast_fp16)[name = string("reshape_6_cast_fp16")];
+            tensor<int32, [5]> transpose_7_perm_0 = const()[name = string("transpose_7_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_7 = const()[name = string("concat_7"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_7_cast_fp16 = transpose(perm = transpose_7_perm_0, x = reshape_6_cast_fp16)[name = string("transpose_190")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_7_cast_fp16 = reshape(shape = concat_7, x = transpose_7_cast_fp16)[name = string("reshape_7_cast_fp16")];
+            tensor<int32, [4]> V_expanded_3_perm_0 = const()[name = string("V_expanded_3_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_49_cast_fp16 = transpose(perm = transpose_49_perm_0, x = reshape_5_cast_fp16)[name = string("transpose_189")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_15_cast_fp16, y = transpose_49_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_27_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_27_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_27_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 1, 512]> var_1592 = sub(x = x_27_cast_fp16, y = reduce_max_1)[name = string("op_1592")];
+            tensor<fp16, [1, 8, 1, 512]> var_1598 = exp(x = var_1592)[name = string("op_1598")];
+            tensor<int32, [1]> var_1608_axes_0 = const()[name = string("op_1608_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1608_keep_dims_0 = const()[name = string("op_1608_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1608 = reduce_sum(axes = var_1608_axes_0, keep_dims = var_1608_keep_dims_0, x = var_1598)[name = string("op_1608")];
+            tensor<fp16, [1, 8, 1, 512]> var_1614_cast_fp16 = real_div(x = var_1598, y = var_1608)[name = string("op_1614_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_3_cast_fp16 = transpose(perm = V_expanded_3_perm_0, x = reshape_7_cast_fp16)[name = string("transpose_188")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_1614_cast_fp16, y = V_expanded_3_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_1625 = const()[name = string("op_1625"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1632 = const()[name = string("op_1632"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1626_cast_fp16 = transpose(perm = var_1625, x = attn_output_7_cast_fp16)[name = string("transpose_187")];
+            tensor<fp16, [1, 1, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_1632, x = var_1626_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_1637 = const()[name = string("op_1637"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1653_pad_type_0 = const()[name = string("op_1653_pad_type_0"), val = string("valid")];
+            int32 var_1653_groups_0 = const()[name = string("op_1653_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1653_strides_0 = const()[name = string("op_1653_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1653_pad_0 = const()[name = string("op_1653_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1653_dilations_0 = const()[name = string("op_1653_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534231744))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536853248))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1638_cast_fp16 = transpose(perm = var_1637, x = attn_output_9_cast_fp16)[name = string("transpose_186")];
+            tensor<fp16, [1, 2560, 1]> var_1653_cast_fp16 = conv(dilations = var_1653_dilations_0, groups = var_1653_groups_0, pad = var_1653_pad_0, pad_type = var_1653_pad_type_0, strides = var_1653_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_1638_cast_fp16)[name = string("op_1653_cast_fp16")];
+            tensor<int32, [3]> var_1657 = const()[name = string("op_1657"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1663 = const()[name = string("op_1663"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_31_cast_fp16 = transpose(perm = var_1657, x = var_1653_cast_fp16)[name = string("transpose_185")];
+            tensor<fp16, [1, 1, 2560]> var_1665_cast_fp16 = mul(x = x_31_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1665_cast_fp16")];
+            bool input_45_interleave_0 = const()[name = string("input_45_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_45_cast_fp16 = concat(axis = var_1663, interleave = input_45_interleave_0, values = (x_31_cast_fp16, var_1665_cast_fp16))[name = string("input_45_cast_fp16")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1660_to_fp16 = const()[name = string("op_1660_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1660_to_fp16, x = input_45_cast_fp16)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1670_split_sizes_0 = const()[name = string("op_1670_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1670_axis_0 = const()[name = string("op_1670_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1670_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1670_cast_fp16_1 = split(axis = var_1670_axis_0, split_sizes = var_1670_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1670_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536855872)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_11_cast_fp16 = mul(x = var_1670_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_33_cast_fp16 = add(x = x_19_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_33_cast_fp16")];
+            int32 var_1679 = const()[name = string("op_1679"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1681_cast_fp16 = mul(x = x_33_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1681_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_47_cast_fp16 = concat(axis = var_1679, interleave = input_47_interleave_0, values = (x_33_cast_fp16, var_1681_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1676_to_fp16 = const()[name = string("op_1676_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1676_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1686_split_sizes_0 = const()[name = string("op_1686_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1686_axis_0 = const()[name = string("op_1686_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1686_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1686_cast_fp16_1 = split(axis = var_1686_axis_0, split_sizes = var_1686_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1686_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536861056)))];
+            tensor<fp16, [1, 1, 2560]> h_9_cast_fp16 = mul(x = var_1686_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1697 = const()[name = string("op_1697"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_49_axes_0 = const()[name = string("input_49_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1698 = transpose(perm = var_1697, x = h_9_cast_fp16)[name = string("transpose_184")];
+            tensor<fp16, [1, 2560, 1, 1]> input_49 = expand_dims(axes = input_49_axes_0, x = var_1698)[name = string("input_49")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_49)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_49)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 1]> input_51 = mul(x = gate_7, y = up_3)[name = string("input_51")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_51)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1738_axes_0 = const()[name = string("op_1738_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1738 = squeeze(axes = var_1738_axes_0, x = mlp_out_3)[name = string("op_1738")];
+            tensor<int32, [3]> var_1742 = const()[name = string("op_1742"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1748 = const()[name = string("op_1748"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_35 = transpose(perm = var_1742, x = var_1738)[name = string("transpose_183")];
+            tensor<fp16, [1, 1, 2560]> var_1750 = mul(x = x_35, y = const_21_promoted)[name = string("op_1750")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_53 = concat(axis = var_1748, interleave = input_53_interleave_0, values = (x_35, var_1750))[name = string("input_53")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1745_to_fp16 = const()[name = string("op_1745_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1745_to_fp16, x = input_53)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1755_split_sizes_0 = const()[name = string("op_1755_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1755_axis_0 = const()[name = string("op_1755_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1755_0, tensor<fp16, [1, 1, 2560]> var_1755_1 = split(axis = var_1755_axis_0, split_sizes = var_1755_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1755")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_13 = mul(x = var_1755_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_15_cast_fp16 = add(x = x_33_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 3328])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 1, 3584])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1783 = const()[name = string("op_1783"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_55_axes_0 = const()[name = string("input_55_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1784 = transpose(perm = var_1783, x = hidden_states_15_cast_fp16)[name = string("transpose_182")];
+            tensor<fp16, [1, 2560, 1, 1]> input_55 = expand_dims(axes = input_55_axes_0, x = var_1784)[name = string("input_55")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_55)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1803 = const()[name = string("op_1803"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1804_cast_fp16 = transpose(perm = var_1803, x = per_layer_slice_3_cast_fp16)[name = string("transpose_181")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1804_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_57_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_57_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536866240))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537193984))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_57_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1820_axes_0 = const()[name = string("op_1820_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1820_cast_fp16 = squeeze(axes = var_1820_axes_0, x = gated_11_cast_fp16)[name = string("op_1820_cast_fp16")];
+            tensor<int32, [3]> var_1824 = const()[name = string("op_1824"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1830 = const()[name = string("op_1830"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_37_cast_fp16 = transpose(perm = var_1824, x = var_1820_cast_fp16)[name = string("transpose_180")];
+            tensor<fp16, [1, 1, 2560]> var_1832_cast_fp16 = mul(x = x_37_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1832_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_59_cast_fp16 = concat(axis = var_1830, interleave = input_59_interleave_0, values = (x_37_cast_fp16, var_1832_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1827_to_fp16 = const()[name = string("op_1827_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1827_to_fp16, x = input_59_cast_fp16)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1837_split_sizes_0 = const()[name = string("op_1837_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1837_axis_0 = const()[name = string("op_1837_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1837_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1837_cast_fp16_1 = split(axis = var_1837_axis_0, split_sizes = var_1837_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1837_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537196608)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1837_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 1, 2560]> x_39_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_1849_axes_0 = const()[name = string("op_1849_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1849_cast_fp16 = squeeze(axes = var_1849_axes_0, x = K_sliding_out_3_cast_fp16)[name = string("op_1849_cast_fp16")];
+            tensor<int32, [1]> var_1851_axes_0 = const()[name = string("op_1851_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1851_cast_fp16 = squeeze(axes = var_1851_axes_0, x = V_sliding_out_3_cast_fp16)[name = string("op_1851_cast_fp16")];
+            tensor<int32, [4]> var_1854_begin_0 = const()[name = string("op_1854_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1854_end_0 = const()[name = string("op_1854_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_1854_end_mask_0 = const()[name = string("op_1854_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1854_squeeze_mask_0 = const()[name = string("op_1854_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1854_cast_fp16 = slice_by_index(begin = var_1854_begin_0, end = var_1854_end_0, end_mask = var_1854_end_mask_0, squeeze_mask = var_1854_squeeze_mask_0, x = K_sliding_in)[name = string("op_1854_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_5_axes_0 = const()[name = string("K_sliding_slot_5_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_5_cast_fp16 = expand_dims(axes = K_sliding_slot_5_axes_0, x = var_1854_cast_fp16)[name = string("K_sliding_slot_5_cast_fp16")];
+            tensor<int32, [4]> var_1859_begin_0 = const()[name = string("op_1859_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1859_end_0 = const()[name = string("op_1859_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_1859_end_mask_0 = const()[name = string("op_1859_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1859_squeeze_mask_0 = const()[name = string("op_1859_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1859_cast_fp16 = slice_by_index(begin = var_1859_begin_0, end = var_1859_end_0, end_mask = var_1859_end_mask_0, squeeze_mask = var_1859_squeeze_mask_0, x = V_sliding_in)[name = string("op_1859_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_5_axes_0 = const()[name = string("V_sliding_slot_5_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_5_cast_fp16 = expand_dims(axes = V_sliding_slot_5_axes_0, x = var_1859_cast_fp16)[name = string("V_sliding_slot_5_cast_fp16")];
+            int32 var_1866 = const()[name = string("op_1866"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1868_cast_fp16 = mul(x = x_39_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1868_cast_fp16")];
+            bool input_61_interleave_0 = const()[name = string("input_61_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_61_cast_fp16 = concat(axis = var_1866, interleave = input_61_interleave_0, values = (x_39_cast_fp16, var_1868_cast_fp16))[name = string("input_61_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1863_to_fp16 = const()[name = string("op_1863_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1863_to_fp16, x = input_61_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1873_split_sizes_0 = const()[name = string("op_1873_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1873_axis_0 = const()[name = string("op_1873_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1873_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1873_cast_fp16_1 = split(axis = var_1873_axis_0, split_sizes = var_1873_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1873_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537201792)))];
+            tensor<fp16, [1, 1, 2560]> h_13_cast_fp16 = mul(x = var_1873_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1879 = const()[name = string("op_1879"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1882_axes_0 = const()[name = string("op_1882_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1880_cast_fp16 = transpose(perm = var_1879, x = h_13_cast_fp16)[name = string("transpose_179")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1882_cast_fp16 = expand_dims(axes = var_1882_axes_0, x = var_1880_cast_fp16)[name = string("op_1882_cast_fp16")];
+            string var_1898_pad_type_0 = const()[name = string("op_1898_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1898_strides_0 = const()[name = string("op_1898_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1898_pad_0 = const()[name = string("op_1898_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1898_dilations_0 = const()[name = string("op_1898_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1898_groups_0 = const()[name = string("op_1898_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1898 = conv(dilations = var_1898_dilations_0, groups = var_1898_groups_0, pad = var_1898_pad_0, pad_type = var_1898_pad_type_0, strides = var_1898_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1882_cast_fp16)[name = string("op_1898")];
+            tensor<int32, [4]> var_1903 = const()[name = string("op_1903"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1904 = reshape(shape = var_1903, x = var_1898)[name = string("op_1904")];
+            tensor<int32, [4]> var_1909 = const()[name = string("op_1909"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1919 = const()[name = string("op_1919"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1910 = transpose(perm = var_1909, x = var_1904)[name = string("transpose_178")];
+            tensor<fp16, [1, 8, 256]> x_41 = reshape(shape = var_1919, x = var_1910)[name = string("x_41")];
+            int32 var_1925 = const()[name = string("op_1925"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1927 = mul(x = x_41, y = const_25_promoted)[name = string("op_1927")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_65 = concat(axis = var_1925, interleave = input_65_interleave_0, values = (x_41, var_1927))[name = string("input_65")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1922_to_fp16 = const()[name = string("op_1922_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1922_to_fp16, x = input_65)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1932_split_sizes_0 = const()[name = string("op_1932_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1932_axis_0 = const()[name = string("op_1932_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1932_0, tensor<fp16, [1, 8, 256]> var_1932_1 = split(axis = var_1932_axis_0, split_sizes = var_1932_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1932")];
+            tensor<fp16, [1, 8, 256]> var_1934 = mul(x = var_1932_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_1934")];
+            tensor<int32, [4]> var_1939 = const()[name = string("op_1939"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_19 = reshape(shape = var_1939, x = var_1934)[name = string("q_19")];
+            tensor<fp16, [1, 8, 1, 256]> var_1941_cast_fp16 = mul(x = q_19, y = cos_s)[name = string("op_1941_cast_fp16")];
+            tensor<int32, [2]> var_1942_split_sizes_0 = const()[name = string("op_1942_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1942_axis_0 = const()[name = string("op_1942_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1942_0, tensor<fp16, [1, 8, 1, 128]> var_1942_1 = split(axis = var_1942_axis_0, split_sizes = var_1942_split_sizes_0, x = q_19)[name = string("op_1942")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1944 = mul(x = var_1942_1, y = const_26_promoted)[name = string("op_1944")];
+            int32 var_1946 = const()[name = string("op_1946"), val = int32(-1)];
+            bool var_1947_interleave_0 = const()[name = string("op_1947_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1947 = concat(axis = var_1946, interleave = var_1947_interleave_0, values = (var_1944, var_1942_0))[name = string("op_1947")];
+            tensor<fp16, [1, 8, 1, 256]> var_1948_cast_fp16 = mul(x = var_1947, y = sin_s)[name = string("op_1948_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_23_cast_fp16 = add(x = var_1941_cast_fp16, y = var_1948_cast_fp16)[name = string("q_23_cast_fp16")];
+            string var_1961_pad_type_0 = const()[name = string("op_1961_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1961_strides_0 = const()[name = string("op_1961_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1961_pad_0 = const()[name = string("op_1961_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1961_dilations_0 = const()[name = string("op_1961_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1961_groups_0 = const()[name = string("op_1961_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1961 = conv(dilations = var_1961_dilations_0, groups = var_1961_groups_0, pad = var_1961_pad_0, pad_type = var_1961_pad_type_0, strides = var_1961_strides_0, weight = layers_2_self_attn_k_proj_weight_palettized, x = var_1882_cast_fp16)[name = string("op_1961")];
+            tensor<int32, [4]> var_1966 = const()[name = string("op_1966"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1967 = reshape(shape = var_1966, x = var_1961)[name = string("op_1967")];
+            tensor<int32, [4]> var_1972 = const()[name = string("op_1972"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_1989_pad_type_0 = const()[name = string("op_1989_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1989_strides_0 = const()[name = string("op_1989_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1989_pad_0 = const()[name = string("op_1989_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1989_dilations_0 = const()[name = string("op_1989_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1989_groups_0 = const()[name = string("op_1989_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1989 = conv(dilations = var_1989_dilations_0, groups = var_1989_groups_0, pad = var_1989_pad_0, pad_type = var_1989_pad_type_0, strides = var_1989_strides_0, weight = layers_2_self_attn_v_proj_weight_palettized, x = var_1882_cast_fp16)[name = string("op_1989")];
+            tensor<int32, [4]> var_1994 = const()[name = string("op_1994"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1995 = reshape(shape = var_1994, x = var_1989)[name = string("op_1995")];
+            tensor<int32, [4]> var_2000 = const()[name = string("op_2000"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2010 = const()[name = string("op_2010"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_1973 = transpose(perm = var_1972, x = var_1967)[name = string("transpose_177")];
+            tensor<fp16, [1, 2, 256]> x_43 = reshape(shape = var_2010, x = var_1973)[name = string("x_43")];
+            int32 var_2016 = const()[name = string("op_2016"), val = int32(-1)];
+            fp16 const_27_promoted = const()[name = string("const_27_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_2018 = mul(x = x_43, y = const_27_promoted)[name = string("op_2018")];
+            bool input_67_interleave_0 = const()[name = string("input_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_67 = concat(axis = var_2016, interleave = input_67_interleave_0, values = (x_43, var_2018))[name = string("input_67")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2013_to_fp16 = const()[name = string("op_2013_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_2013_to_fp16, x = input_67)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_2023_split_sizes_0 = const()[name = string("op_2023_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2023_axis_0 = const()[name = string("op_2023_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_2023_0, tensor<fp16, [1, 2, 256]> var_2023_1 = split(axis = var_2023_axis_0, split_sizes = var_2023_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_2023")];
+            tensor<fp16, [1, 2, 256]> var_2025 = mul(x = var_2023_0, y = layers_2_self_attn_k_norm_weight)[name = string("op_2025")];
+            tensor<int32, [4]> var_2030 = const()[name = string("op_2030"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_21 = reshape(shape = var_2030, x = var_2025)[name = string("q_21")];
+            fp16 var_2032_promoted = const()[name = string("op_2032_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_2001 = transpose(perm = var_2000, x = var_1995)[name = string("transpose_176")];
+            tensor<fp16, [1, 2, 1, 256]> var_2033 = pow(x = var_2001, y = var_2032_promoted)[name = string("op_2033")];
+            tensor<int32, [1]> var_2038_axes_0 = const()[name = string("op_2038_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2038_keep_dims_0 = const()[name = string("op_2038_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_2038 = reduce_mean(axes = var_2038_axes_0, keep_dims = var_2038_keep_dims_0, x = var_2033)[name = string("op_2038")];
+            fp16 var_2040_to_fp16 = const()[name = string("op_2040_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_5_cast_fp16 = add(x = var_2038, y = var_2040_to_fp16)[name = string("mean_sq_5_cast_fp16")];
+            fp32 var_2042_epsilon_0 = const()[name = string("op_2042_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_2042_cast_fp16 = rsqrt(epsilon = var_2042_epsilon_0, x = mean_sq_5_cast_fp16)[name = string("op_2042_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_71_cast_fp16 = mul(x = var_2001, y = var_2042_cast_fp16)[name = string("input_71_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_2044_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_2044_cast_fp16")];
+            tensor<int32, [2]> var_2045_split_sizes_0 = const()[name = string("op_2045_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2045_axis_0 = const()[name = string("op_2045_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_2045_0, tensor<fp16, [1, 2, 1, 128]> var_2045_1 = split(axis = var_2045_axis_0, split_sizes = var_2045_split_sizes_0, x = q_21)[name = string("op_2045")];
+            fp16 const_28_promoted = const()[name = string("const_28_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_2047 = mul(x = var_2045_1, y = const_28_promoted)[name = string("op_2047")];
+            int32 var_2049 = const()[name = string("op_2049"), val = int32(-1)];
+            bool var_2050_interleave_0 = const()[name = string("op_2050_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_2050 = concat(axis = var_2049, interleave = var_2050_interleave_0, values = (var_2047, var_2045_0))[name = string("op_2050")];
+            tensor<fp16, [1, 2, 1, 256]> var_2051_cast_fp16 = mul(x = var_2050, y = sin_s)[name = string("op_2051_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_69_cast_fp16 = add(x = var_2044_cast_fp16, y = var_2051_cast_fp16)[name = string("input_69_cast_fp16")];
+            tensor<int32, [8]> k_padded_5_pad_0 = const()[name = string("k_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_5_mode_0 = const()[name = string("k_padded_5_mode_0"), val = string("constant")];
+            fp16 const_29_to_fp16 = const()[name = string("const_29_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_5_cast_fp16 = pad(constant_val = const_29_to_fp16, mode = k_padded_5_mode_0, pad = k_padded_5_pad_0, x = input_69_cast_fp16)[name = string("k_padded_5_cast_fp16")];
+            tensor<int32, [8]> v_padded_5_pad_0 = const()[name = string("v_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_5_mode_0 = const()[name = string("v_padded_5_mode_0"), val = string("constant")];
+            fp16 const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_5_cast_fp16 = pad(constant_val = const_30_to_fp16, mode = v_padded_5_mode_0, pad = v_padded_5_pad_0, x = input_71_cast_fp16)[name = string("v_padded_5_cast_fp16")];
+            tensor<int32, [4]> var_2080_begin_0 = const()[name = string("op_2080_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2080_end_0 = const()[name = string("op_2080_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2080_end_mask_0 = const()[name = string("op_2080_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2080_cast_fp16 = slice_by_index(begin = var_2080_begin_0, end = var_2080_end_0, end_mask = var_2080_end_mask_0, x = K_sliding_slot_5_cast_fp16)[name = string("op_2080_cast_fp16")];
+            int32 var_2087 = const()[name = string("op_2087"), val = int32(2)];
+            bool K_sliding_out_5_interleave_0 = const()[name = string("K_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_5_cast_fp16 = concat(axis = var_2087, interleave = K_sliding_out_5_interleave_0, values = (var_2080_cast_fp16, k_padded_5_cast_fp16))[name = string("K_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2103_begin_0 = const()[name = string("op_2103_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2103_end_0 = const()[name = string("op_2103_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2103_end_mask_0 = const()[name = string("op_2103_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2103_cast_fp16 = slice_by_index(begin = var_2103_begin_0, end = var_2103_end_0, end_mask = var_2103_end_mask_0, x = V_sliding_slot_5_cast_fp16)[name = string("op_2103_cast_fp16")];
+            int32 var_2110 = const()[name = string("op_2110"), val = int32(2)];
+            bool V_sliding_out_5_interleave_0 = const()[name = string("V_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_5_cast_fp16 = concat(axis = var_2110, interleave = V_sliding_out_5_interleave_0, values = (var_2103_cast_fp16, v_padded_5_cast_fp16))[name = string("V_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_5_begin_0 = const()[name = string("K_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_5_end_0 = const()[name = string("K_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_5_end_mask_0 = const()[name = string("K_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_5_cast_fp16 = slice_by_index(begin = K_for_attn_5_begin_0, end = K_for_attn_5_end_0, end_mask = K_for_attn_5_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("K_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_5_begin_0 = const()[name = string("V_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_5_end_0 = const()[name = string("V_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_5_end_mask_0 = const()[name = string("V_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_5_cast_fp16 = slice_by_index(begin = V_for_attn_5_begin_0, end = V_for_attn_5_end_0, end_mask = V_for_attn_5_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("V_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = K_for_attn_5_cast_fp16)[name = string("transpose_175")];
+            tensor<fp16, [8, 1, 512, 256]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_174")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_50_perm_0 = const()[name = string("transpose_50_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = V_for_attn_5_cast_fp16)[name = string("transpose_173")];
+            tensor<fp16, [8, 1, 512, 256]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_172")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_50_cast_fp16 = transpose(perm = transpose_50_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_171")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_23_cast_fp16, y = transpose_50_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_47_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_47_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_47_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 1, 512]> var_2151 = sub(x = x_47_cast_fp16, y = reduce_max_2)[name = string("op_2151")];
+            tensor<fp16, [1, 8, 1, 512]> var_2157 = exp(x = var_2151)[name = string("op_2157")];
+            tensor<int32, [1]> var_2167_axes_0 = const()[name = string("op_2167_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2167_keep_dims_0 = const()[name = string("op_2167_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2167 = reduce_sum(axes = var_2167_axes_0, keep_dims = var_2167_keep_dims_0, x = var_2157)[name = string("op_2167")];
+            tensor<fp16, [1, 8, 1, 512]> var_2173_cast_fp16 = real_div(x = var_2157, y = var_2167)[name = string("op_2173_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_170")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_2173_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_2184 = const()[name = string("op_2184"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2191 = const()[name = string("op_2191"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2185_cast_fp16 = transpose(perm = var_2184, x = attn_output_13_cast_fp16)[name = string("transpose_169")];
+            tensor<fp16, [1, 1, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_2191, x = var_2185_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_2196 = const()[name = string("op_2196"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2212_pad_type_0 = const()[name = string("op_2212_pad_type_0"), val = string("valid")];
+            int32 var_2212_groups_0 = const()[name = string("op_2212_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2212_strides_0 = const()[name = string("op_2212_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2212_pad_0 = const()[name = string("op_2212_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2212_dilations_0 = const()[name = string("op_2212_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537206976))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539828480))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2197_cast_fp16 = transpose(perm = var_2196, x = attn_output_15_cast_fp16)[name = string("transpose_168")];
+            tensor<fp16, [1, 2560, 1]> var_2212_cast_fp16 = conv(dilations = var_2212_dilations_0, groups = var_2212_groups_0, pad = var_2212_pad_0, pad_type = var_2212_pad_type_0, strides = var_2212_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_2197_cast_fp16)[name = string("op_2212_cast_fp16")];
+            tensor<int32, [3]> var_2216 = const()[name = string("op_2216"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2222 = const()[name = string("op_2222"), val = int32(-1)];
+            fp16 const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_51_cast_fp16 = transpose(perm = var_2216, x = var_2212_cast_fp16)[name = string("transpose_167")];
+            tensor<fp16, [1, 1, 2560]> var_2224_cast_fp16 = mul(x = x_51_cast_fp16, y = const_31_promoted_to_fp16)[name = string("op_2224_cast_fp16")];
+            bool input_75_interleave_0 = const()[name = string("input_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_75_cast_fp16 = concat(axis = var_2222, interleave = input_75_interleave_0, values = (x_51_cast_fp16, var_2224_cast_fp16))[name = string("input_75_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2219_to_fp16 = const()[name = string("op_2219_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_2219_to_fp16, x = input_75_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_2229_split_sizes_0 = const()[name = string("op_2229_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2229_axis_0 = const()[name = string("op_2229_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2229_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2229_cast_fp16_1 = split(axis = var_2229_axis_0, split_sizes = var_2229_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_2229_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539831104)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_17_cast_fp16 = mul(x = var_2229_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_53_cast_fp16 = add(x = x_39_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_53_cast_fp16")];
+            int32 var_2238 = const()[name = string("op_2238"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2240_cast_fp16 = mul(x = x_53_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2240_cast_fp16")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_77_cast_fp16 = concat(axis = var_2238, interleave = input_77_interleave_0, values = (x_53_cast_fp16, var_2240_cast_fp16))[name = string("input_77_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2235_to_fp16 = const()[name = string("op_2235_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_2235_to_fp16, x = input_77_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_2245_split_sizes_0 = const()[name = string("op_2245_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2245_axis_0 = const()[name = string("op_2245_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2245_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2245_cast_fp16_1 = split(axis = var_2245_axis_0, split_sizes = var_2245_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_2245_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539836288)))];
+            tensor<fp16, [1, 1, 2560]> h_15_cast_fp16 = mul(x = var_2245_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_2256 = const()[name = string("op_2256"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_79_axes_0 = const()[name = string("input_79_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2257 = transpose(perm = var_2256, x = h_15_cast_fp16)[name = string("transpose_166")];
+            tensor<fp16, [1, 2560, 1, 1]> input_79 = expand_dims(axes = input_79_axes_0, x = var_2257)[name = string("input_79")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_79)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_79)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 1]> input_81 = mul(x = gate_11, y = up_5)[name = string("input_81")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_81)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_2297_axes_0 = const()[name = string("op_2297_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2297 = squeeze(axes = var_2297_axes_0, x = mlp_out_5)[name = string("op_2297")];
+            tensor<int32, [3]> var_2301 = const()[name = string("op_2301"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2307 = const()[name = string("op_2307"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_55 = transpose(perm = var_2301, x = var_2297)[name = string("transpose_165")];
+            tensor<fp16, [1, 1, 2560]> var_2309 = mul(x = x_55, y = const_33_promoted)[name = string("op_2309")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_83 = concat(axis = var_2307, interleave = input_83_interleave_0, values = (x_55, var_2309))[name = string("input_83")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2304_to_fp16 = const()[name = string("op_2304_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_2304_to_fp16, x = input_83)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_2314_split_sizes_0 = const()[name = string("op_2314_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2314_axis_0 = const()[name = string("op_2314_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2314_0, tensor<fp16, [1, 1, 2560]> var_2314_1 = split(axis = var_2314_axis_0, split_sizes = var_2314_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_2314")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_23 = mul(x = var_2314_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_25_cast_fp16 = add(x = x_53_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 3584])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 1, 3840])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_2342 = const()[name = string("op_2342"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2343 = transpose(perm = var_2342, x = hidden_states_25_cast_fp16)[name = string("transpose_164")];
+            tensor<fp16, [1, 2560, 1, 1]> input_85 = expand_dims(axes = input_85_axes_0, x = var_2343)[name = string("input_85")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_85)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_2362 = const()[name = string("op_2362"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2363_cast_fp16 = transpose(perm = var_2362, x = per_layer_slice_5_cast_fp16)[name = string("transpose_163")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_2363_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_87_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_87_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539841472))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540169216))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_87_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_2379_axes_0 = const()[name = string("op_2379_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2379_cast_fp16 = squeeze(axes = var_2379_axes_0, x = gated_17_cast_fp16)[name = string("op_2379_cast_fp16")];
+            tensor<int32, [3]> var_2383 = const()[name = string("op_2383"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2389 = const()[name = string("op_2389"), val = int32(-1)];
+            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_57_cast_fp16 = transpose(perm = var_2383, x = var_2379_cast_fp16)[name = string("transpose_162")];
+            tensor<fp16, [1, 1, 2560]> var_2391_cast_fp16 = mul(x = x_57_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_2391_cast_fp16")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_89_cast_fp16 = concat(axis = var_2389, interleave = input_89_interleave_0, values = (x_57_cast_fp16, var_2391_cast_fp16))[name = string("input_89_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2386_to_fp16 = const()[name = string("op_2386_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_2386_to_fp16, x = input_89_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_2396_split_sizes_0 = const()[name = string("op_2396_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2396_axis_0 = const()[name = string("op_2396_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2396_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2396_cast_fp16_1 = split(axis = var_2396_axis_0, split_sizes = var_2396_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_2396_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540171840)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_29_cast_fp16 = mul(x = var_2396_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.58p-1])];
+            tensor<fp16, [1, 1, 2560]> x_59_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_35_promoted_to_fp16)[name = string("x_59_cast_fp16")];
+            tensor<int32, [1]> var_2408_axes_0 = const()[name = string("op_2408_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2408_cast_fp16 = squeeze(axes = var_2408_axes_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2408_cast_fp16")];
+            tensor<int32, [1]> var_2410_axes_0 = const()[name = string("op_2410_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2410_cast_fp16 = squeeze(axes = var_2410_axes_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2410_cast_fp16")];
+            tensor<int32, [4]> var_2413_begin_0 = const()[name = string("op_2413_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2413_end_0 = const()[name = string("op_2413_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2413_end_mask_0 = const()[name = string("op_2413_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2413_squeeze_mask_0 = const()[name = string("op_2413_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2413_cast_fp16 = slice_by_index(begin = var_2413_begin_0, end = var_2413_end_0, end_mask = var_2413_end_mask_0, squeeze_mask = var_2413_squeeze_mask_0, x = K_sliding_in)[name = string("op_2413_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_7_axes_0 = const()[name = string("K_sliding_slot_7_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_7_cast_fp16 = expand_dims(axes = K_sliding_slot_7_axes_0, x = var_2413_cast_fp16)[name = string("K_sliding_slot_7_cast_fp16")];
+            tensor<int32, [4]> var_2418_begin_0 = const()[name = string("op_2418_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2418_end_0 = const()[name = string("op_2418_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2418_end_mask_0 = const()[name = string("op_2418_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2418_squeeze_mask_0 = const()[name = string("op_2418_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2418_cast_fp16 = slice_by_index(begin = var_2418_begin_0, end = var_2418_end_0, end_mask = var_2418_end_mask_0, squeeze_mask = var_2418_squeeze_mask_0, x = V_sliding_in)[name = string("op_2418_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_7_axes_0 = const()[name = string("V_sliding_slot_7_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_7_cast_fp16 = expand_dims(axes = V_sliding_slot_7_axes_0, x = var_2418_cast_fp16)[name = string("V_sliding_slot_7_cast_fp16")];
+            int32 var_2425 = const()[name = string("op_2425"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2427_cast_fp16 = mul(x = x_59_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2427_cast_fp16")];
+            bool input_91_interleave_0 = const()[name = string("input_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_91_cast_fp16 = concat(axis = var_2425, interleave = input_91_interleave_0, values = (x_59_cast_fp16, var_2427_cast_fp16))[name = string("input_91_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2422_to_fp16 = const()[name = string("op_2422_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_2422_to_fp16, x = input_91_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_2432_split_sizes_0 = const()[name = string("op_2432_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2432_axis_0 = const()[name = string("op_2432_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2432_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2432_cast_fp16_1 = split(axis = var_2432_axis_0, split_sizes = var_2432_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_2432_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540177024)))];
+            tensor<fp16, [1, 1, 2560]> h_19_cast_fp16 = mul(x = var_2432_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_2438 = const()[name = string("op_2438"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2441_axes_0 = const()[name = string("op_2441_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2439_cast_fp16 = transpose(perm = var_2438, x = h_19_cast_fp16)[name = string("transpose_161")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2441_cast_fp16 = expand_dims(axes = var_2441_axes_0, x = var_2439_cast_fp16)[name = string("op_2441_cast_fp16")];
+            string var_2457_pad_type_0 = const()[name = string("op_2457_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2457_strides_0 = const()[name = string("op_2457_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2457_pad_0 = const()[name = string("op_2457_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2457_dilations_0 = const()[name = string("op_2457_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2457_groups_0 = const()[name = string("op_2457_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2457 = conv(dilations = var_2457_dilations_0, groups = var_2457_groups_0, pad = var_2457_pad_0, pad_type = var_2457_pad_type_0, strides = var_2457_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_2441_cast_fp16)[name = string("op_2457")];
+            tensor<int32, [4]> var_2462 = const()[name = string("op_2462"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2463 = reshape(shape = var_2462, x = var_2457)[name = string("op_2463")];
+            tensor<int32, [4]> var_2468 = const()[name = string("op_2468"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2478 = const()[name = string("op_2478"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2469 = transpose(perm = var_2468, x = var_2463)[name = string("transpose_160")];
+            tensor<fp16, [1, 8, 256]> x_61 = reshape(shape = var_2478, x = var_2469)[name = string("x_61")];
+            int32 var_2484 = const()[name = string("op_2484"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2486 = mul(x = x_61, y = const_37_promoted)[name = string("op_2486")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_95 = concat(axis = var_2484, interleave = input_95_interleave_0, values = (x_61, var_2486))[name = string("input_95")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2481_to_fp16 = const()[name = string("op_2481_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_2481_to_fp16, x = input_95)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_2491_split_sizes_0 = const()[name = string("op_2491_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2491_axis_0 = const()[name = string("op_2491_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2491_0, tensor<fp16, [1, 8, 256]> var_2491_1 = split(axis = var_2491_axis_0, split_sizes = var_2491_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_2491")];
+            tensor<fp16, [1, 8, 256]> var_2493 = mul(x = var_2491_0, y = layers_3_self_attn_q_norm_weight)[name = string("op_2493")];
+            tensor<int32, [4]> var_2498 = const()[name = string("op_2498"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_27 = reshape(shape = var_2498, x = var_2493)[name = string("q_27")];
+            tensor<fp16, [1, 8, 1, 256]> var_2500_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_2500_cast_fp16")];
+            tensor<int32, [2]> var_2501_split_sizes_0 = const()[name = string("op_2501_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2501_axis_0 = const()[name = string("op_2501_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2501_0, tensor<fp16, [1, 8, 1, 128]> var_2501_1 = split(axis = var_2501_axis_0, split_sizes = var_2501_split_sizes_0, x = q_27)[name = string("op_2501")];
+            fp16 const_38_promoted = const()[name = string("const_38_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2503 = mul(x = var_2501_1, y = const_38_promoted)[name = string("op_2503")];
+            int32 var_2505 = const()[name = string("op_2505"), val = int32(-1)];
+            bool var_2506_interleave_0 = const()[name = string("op_2506_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2506 = concat(axis = var_2505, interleave = var_2506_interleave_0, values = (var_2503, var_2501_0))[name = string("op_2506")];
+            tensor<fp16, [1, 8, 1, 256]> var_2507_cast_fp16 = mul(x = var_2506, y = sin_s)[name = string("op_2507_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_31_cast_fp16 = add(x = var_2500_cast_fp16, y = var_2507_cast_fp16)[name = string("q_31_cast_fp16")];
+            string var_2520_pad_type_0 = const()[name = string("op_2520_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2520_strides_0 = const()[name = string("op_2520_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2520_pad_0 = const()[name = string("op_2520_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2520_dilations_0 = const()[name = string("op_2520_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2520_groups_0 = const()[name = string("op_2520_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2520 = conv(dilations = var_2520_dilations_0, groups = var_2520_groups_0, pad = var_2520_pad_0, pad_type = var_2520_pad_type_0, strides = var_2520_strides_0, weight = layers_3_self_attn_k_proj_weight_palettized, x = var_2441_cast_fp16)[name = string("op_2520")];
+            tensor<int32, [4]> var_2525 = const()[name = string("op_2525"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2526 = reshape(shape = var_2525, x = var_2520)[name = string("op_2526")];
+            tensor<int32, [4]> var_2531 = const()[name = string("op_2531"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_2548_pad_type_0 = const()[name = string("op_2548_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2548_strides_0 = const()[name = string("op_2548_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2548_pad_0 = const()[name = string("op_2548_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2548_dilations_0 = const()[name = string("op_2548_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2548_groups_0 = const()[name = string("op_2548_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2548 = conv(dilations = var_2548_dilations_0, groups = var_2548_groups_0, pad = var_2548_pad_0, pad_type = var_2548_pad_type_0, strides = var_2548_strides_0, weight = layers_3_self_attn_v_proj_weight_palettized, x = var_2441_cast_fp16)[name = string("op_2548")];
+            tensor<int32, [4]> var_2553 = const()[name = string("op_2553"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2554 = reshape(shape = var_2553, x = var_2548)[name = string("op_2554")];
+            tensor<int32, [4]> var_2559 = const()[name = string("op_2559"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2569 = const()[name = string("op_2569"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_2532 = transpose(perm = var_2531, x = var_2526)[name = string("transpose_159")];
+            tensor<fp16, [1, 2, 256]> x_63 = reshape(shape = var_2569, x = var_2532)[name = string("x_63")];
+            int32 var_2575 = const()[name = string("op_2575"), val = int32(-1)];
+            fp16 const_39_promoted = const()[name = string("const_39_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_2577 = mul(x = x_63, y = const_39_promoted)[name = string("op_2577")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_97 = concat(axis = var_2575, interleave = input_97_interleave_0, values = (x_63, var_2577))[name = string("input_97")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2572_to_fp16 = const()[name = string("op_2572_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_2572_to_fp16, x = input_97)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_2582_split_sizes_0 = const()[name = string("op_2582_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2582_axis_0 = const()[name = string("op_2582_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_2582_0, tensor<fp16, [1, 2, 256]> var_2582_1 = split(axis = var_2582_axis_0, split_sizes = var_2582_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_2582")];
+            tensor<fp16, [1, 2, 256]> var_2584 = mul(x = var_2582_0, y = layers_3_self_attn_k_norm_weight)[name = string("op_2584")];
+            tensor<int32, [4]> var_2589 = const()[name = string("op_2589"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_29 = reshape(shape = var_2589, x = var_2584)[name = string("q_29")];
+            fp16 var_2591_promoted = const()[name = string("op_2591_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_2560 = transpose(perm = var_2559, x = var_2554)[name = string("transpose_158")];
+            tensor<fp16, [1, 2, 1, 256]> var_2592 = pow(x = var_2560, y = var_2591_promoted)[name = string("op_2592")];
+            tensor<int32, [1]> var_2597_axes_0 = const()[name = string("op_2597_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2597_keep_dims_0 = const()[name = string("op_2597_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_2597 = reduce_mean(axes = var_2597_axes_0, keep_dims = var_2597_keep_dims_0, x = var_2592)[name = string("op_2597")];
+            fp16 var_2599_to_fp16 = const()[name = string("op_2599_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_7_cast_fp16 = add(x = var_2597, y = var_2599_to_fp16)[name = string("mean_sq_7_cast_fp16")];
+            fp32 var_2601_epsilon_0 = const()[name = string("op_2601_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_2601_cast_fp16 = rsqrt(epsilon = var_2601_epsilon_0, x = mean_sq_7_cast_fp16)[name = string("op_2601_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_101_cast_fp16 = mul(x = var_2560, y = var_2601_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_2603_cast_fp16 = mul(x = q_29, y = cos_s)[name = string("op_2603_cast_fp16")];
+            tensor<int32, [2]> var_2604_split_sizes_0 = const()[name = string("op_2604_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2604_axis_0 = const()[name = string("op_2604_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_2604_0, tensor<fp16, [1, 2, 1, 128]> var_2604_1 = split(axis = var_2604_axis_0, split_sizes = var_2604_split_sizes_0, x = q_29)[name = string("op_2604")];
+            fp16 const_40_promoted = const()[name = string("const_40_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_2606 = mul(x = var_2604_1, y = const_40_promoted)[name = string("op_2606")];
+            int32 var_2608 = const()[name = string("op_2608"), val = int32(-1)];
+            bool var_2609_interleave_0 = const()[name = string("op_2609_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_2609 = concat(axis = var_2608, interleave = var_2609_interleave_0, values = (var_2606, var_2604_0))[name = string("op_2609")];
+            tensor<fp16, [1, 2, 1, 256]> var_2610_cast_fp16 = mul(x = var_2609, y = sin_s)[name = string("op_2610_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_99_cast_fp16 = add(x = var_2603_cast_fp16, y = var_2610_cast_fp16)[name = string("input_99_cast_fp16")];
+            tensor<int32, [8]> k_padded_7_pad_0 = const()[name = string("k_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_7_mode_0 = const()[name = string("k_padded_7_mode_0"), val = string("constant")];
+            fp16 const_41_to_fp16 = const()[name = string("const_41_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_7_cast_fp16 = pad(constant_val = const_41_to_fp16, mode = k_padded_7_mode_0, pad = k_padded_7_pad_0, x = input_99_cast_fp16)[name = string("k_padded_7_cast_fp16")];
+            tensor<int32, [8]> v_padded_7_pad_0 = const()[name = string("v_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_7_mode_0 = const()[name = string("v_padded_7_mode_0"), val = string("constant")];
+            fp16 const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_7_cast_fp16 = pad(constant_val = const_42_to_fp16, mode = v_padded_7_mode_0, pad = v_padded_7_pad_0, x = input_101_cast_fp16)[name = string("v_padded_7_cast_fp16")];
+            tensor<int32, [4]> var_2639_begin_0 = const()[name = string("op_2639_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2639_end_0 = const()[name = string("op_2639_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2639_end_mask_0 = const()[name = string("op_2639_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2639_cast_fp16 = slice_by_index(begin = var_2639_begin_0, end = var_2639_end_0, end_mask = var_2639_end_mask_0, x = K_sliding_slot_7_cast_fp16)[name = string("op_2639_cast_fp16")];
+            int32 var_2646 = const()[name = string("op_2646"), val = int32(2)];
+            bool K_sliding_out_7_interleave_0 = const()[name = string("K_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_7_cast_fp16 = concat(axis = var_2646, interleave = K_sliding_out_7_interleave_0, values = (var_2639_cast_fp16, k_padded_7_cast_fp16))[name = string("K_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_2662_begin_0 = const()[name = string("op_2662_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2662_end_0 = const()[name = string("op_2662_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2662_end_mask_0 = const()[name = string("op_2662_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2662_cast_fp16 = slice_by_index(begin = var_2662_begin_0, end = var_2662_end_0, end_mask = var_2662_end_mask_0, x = V_sliding_slot_7_cast_fp16)[name = string("op_2662_cast_fp16")];
+            int32 var_2669 = const()[name = string("op_2669"), val = int32(2)];
+            bool V_sliding_out_7_interleave_0 = const()[name = string("V_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_7_cast_fp16 = concat(axis = var_2669, interleave = V_sliding_out_7_interleave_0, values = (var_2662_cast_fp16, v_padded_7_cast_fp16))[name = string("V_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_7_begin_0 = const()[name = string("K_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_7_end_0 = const()[name = string("K_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_7_end_mask_0 = const()[name = string("K_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_7_cast_fp16 = slice_by_index(begin = K_for_attn_7_begin_0, end = K_for_attn_7_end_0, end_mask = K_for_attn_7_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("K_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_7_begin_0 = const()[name = string("V_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_7_end_0 = const()[name = string("V_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_7_end_mask_0 = const()[name = string("V_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_7_cast_fp16 = slice_by_index(begin = V_for_attn_7_begin_0, end = V_for_attn_7_end_0, end_mask = V_for_attn_7_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("V_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> transpose_12_perm_0 = const()[name = string("transpose_12_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_6_reps_0 = const()[name = string("tile_6_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_12_cast_fp16 = transpose(perm = transpose_12_perm_0, x = K_for_attn_7_cast_fp16)[name = string("transpose_157")];
+            tensor<fp16, [8, 1, 512, 256]> tile_6_cast_fp16 = tile(reps = tile_6_reps_0, x = transpose_12_cast_fp16)[name = string("tile_6_cast_fp16")];
+            tensor<int32, [5]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_12_cast_fp16 = reshape(shape = concat_12, x = tile_6_cast_fp16)[name = string("reshape_12_cast_fp16")];
+            tensor<int32, [5]> transpose_13_perm_0 = const()[name = string("transpose_13_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_13 = const()[name = string("concat_13"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_13_cast_fp16 = transpose(perm = transpose_13_perm_0, x = reshape_12_cast_fp16)[name = string("transpose_156")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_13_cast_fp16 = reshape(shape = concat_13, x = transpose_13_cast_fp16)[name = string("reshape_13_cast_fp16")];
+            tensor<int32, [4]> transpose_51_perm_0 = const()[name = string("transpose_51_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_14_perm_0 = const()[name = string("transpose_14_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_7_reps_0 = const()[name = string("tile_7_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_14_cast_fp16 = transpose(perm = transpose_14_perm_0, x = V_for_attn_7_cast_fp16)[name = string("transpose_155")];
+            tensor<fp16, [8, 1, 512, 256]> tile_7_cast_fp16 = tile(reps = tile_7_reps_0, x = transpose_14_cast_fp16)[name = string("tile_7_cast_fp16")];
+            tensor<int32, [5]> concat_14 = const()[name = string("concat_14"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_14_cast_fp16 = reshape(shape = concat_14, x = tile_7_cast_fp16)[name = string("reshape_14_cast_fp16")];
+            tensor<int32, [5]> transpose_15_perm_0 = const()[name = string("transpose_15_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_15 = const()[name = string("concat_15"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_15_cast_fp16 = transpose(perm = transpose_15_perm_0, x = reshape_14_cast_fp16)[name = string("transpose_154")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_15_cast_fp16 = reshape(shape = concat_15, x = transpose_15_cast_fp16)[name = string("reshape_15_cast_fp16")];
+            tensor<int32, [4]> V_expanded_7_perm_0 = const()[name = string("V_expanded_7_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_51_cast_fp16 = transpose(perm = transpose_51_perm_0, x = reshape_13_cast_fp16)[name = string("transpose_153")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_31_cast_fp16, y = transpose_51_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_67_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 1, 512]> var_2710 = sub(x = x_67_cast_fp16, y = reduce_max_3)[name = string("op_2710")];
+            tensor<fp16, [1, 8, 1, 512]> var_2716 = exp(x = var_2710)[name = string("op_2716")];
+            tensor<int32, [1]> var_2726_axes_0 = const()[name = string("op_2726_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2726_keep_dims_0 = const()[name = string("op_2726_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2726 = reduce_sum(axes = var_2726_axes_0, keep_dims = var_2726_keep_dims_0, x = var_2716)[name = string("op_2726")];
+            tensor<fp16, [1, 8, 1, 512]> var_2732_cast_fp16 = real_div(x = var_2716, y = var_2726)[name = string("op_2732_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_7_cast_fp16 = transpose(perm = V_expanded_7_perm_0, x = reshape_15_cast_fp16)[name = string("transpose_152")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_2732_cast_fp16, y = V_expanded_7_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_2743 = const()[name = string("op_2743"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2750 = const()[name = string("op_2750"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2744_cast_fp16 = transpose(perm = var_2743, x = attn_output_19_cast_fp16)[name = string("transpose_151")];
+            tensor<fp16, [1, 1, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_2750, x = var_2744_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_2755 = const()[name = string("op_2755"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2771_pad_type_0 = const()[name = string("op_2771_pad_type_0"), val = string("valid")];
+            int32 var_2771_groups_0 = const()[name = string("op_2771_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2771_strides_0 = const()[name = string("op_2771_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2771_pad_0 = const()[name = string("op_2771_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2771_dilations_0 = const()[name = string("op_2771_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540182208))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542803712))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2756_cast_fp16 = transpose(perm = var_2755, x = attn_output_21_cast_fp16)[name = string("transpose_150")];
+            tensor<fp16, [1, 2560, 1]> var_2771_cast_fp16 = conv(dilations = var_2771_dilations_0, groups = var_2771_groups_0, pad = var_2771_pad_0, pad_type = var_2771_pad_type_0, strides = var_2771_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_2756_cast_fp16)[name = string("op_2771_cast_fp16")];
+            tensor<int32, [3]> var_2775 = const()[name = string("op_2775"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2781 = const()[name = string("op_2781"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_71_cast_fp16 = transpose(perm = var_2775, x = var_2771_cast_fp16)[name = string("transpose_149")];
+            tensor<fp16, [1, 1, 2560]> var_2783_cast_fp16 = mul(x = x_71_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_2783_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_105_cast_fp16 = concat(axis = var_2781, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_2783_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2778_to_fp16 = const()[name = string("op_2778_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_2778_to_fp16, x = input_105_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_2788_split_sizes_0 = const()[name = string("op_2788_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2788_axis_0 = const()[name = string("op_2788_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2788_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2788_cast_fp16_1 = split(axis = var_2788_axis_0, split_sizes = var_2788_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_2788_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542806336)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_23_cast_fp16 = mul(x = var_2788_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_73_cast_fp16 = add(x = x_59_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2797 = const()[name = string("op_2797"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2799_cast_fp16 = mul(x = x_73_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2799_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_107_cast_fp16 = concat(axis = var_2797, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_2799_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2794_to_fp16 = const()[name = string("op_2794_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2794_to_fp16, x = input_107_cast_fp16)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2804_split_sizes_0 = const()[name = string("op_2804_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2804_axis_0 = const()[name = string("op_2804_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2804_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2804_cast_fp16_1 = split(axis = var_2804_axis_0, split_sizes = var_2804_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2804_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542811520)))];
+            tensor<fp16, [1, 1, 2560]> h_21_cast_fp16 = mul(x = var_2804_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_2815 = const()[name = string("op_2815"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2816 = transpose(perm = var_2815, x = h_21_cast_fp16)[name = string("transpose_148")];
+            tensor<fp16, [1, 2560, 1, 1]> input_109 = expand_dims(axes = input_109_axes_0, x = var_2816)[name = string("input_109")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 1]> input_111 = mul(x = gate_15, y = up_7)[name = string("input_111")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_2856_axes_0 = const()[name = string("op_2856_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2856 = squeeze(axes = var_2856_axes_0, x = mlp_out_7)[name = string("op_2856")];
+            tensor<int32, [3]> var_2860 = const()[name = string("op_2860"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2866 = const()[name = string("op_2866"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_75 = transpose(perm = var_2860, x = var_2856)[name = string("transpose_147")];
+            tensor<fp16, [1, 1, 2560]> var_2868 = mul(x = x_75, y = const_45_promoted)[name = string("op_2868")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_113 = concat(axis = var_2866, interleave = input_113_interleave_0, values = (x_75, var_2868))[name = string("input_113")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2863_to_fp16 = const()[name = string("op_2863_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2863_to_fp16, x = input_113)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2873_split_sizes_0 = const()[name = string("op_2873_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2873_axis_0 = const()[name = string("op_2873_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2873_0, tensor<fp16, [1, 1, 2560]> var_2873_1 = split(axis = var_2873_axis_0, split_sizes = var_2873_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2873")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_33 = mul(x = var_2873_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_35_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 3840])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 1, 4096])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_2901 = const()[name = string("op_2901"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2902 = transpose(perm = var_2901, x = hidden_states_35_cast_fp16)[name = string("transpose_146")];
+            tensor<fp16, [1, 2560, 1, 1]> input_115 = expand_dims(axes = input_115_axes_0, x = var_2902)[name = string("input_115")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_2921 = const()[name = string("op_2921"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2922_cast_fp16 = transpose(perm = var_2921, x = per_layer_slice_7_cast_fp16)[name = string("transpose_145")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_2922_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_117_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542816704))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543144448))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_2938_axes_0 = const()[name = string("op_2938_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2938_cast_fp16 = squeeze(axes = var_2938_axes_0, x = gated_23_cast_fp16)[name = string("op_2938_cast_fp16")];
+            tensor<int32, [3]> var_2942 = const()[name = string("op_2942"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2948 = const()[name = string("op_2948"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_77_cast_fp16 = transpose(perm = var_2942, x = var_2938_cast_fp16)[name = string("transpose_144")];
+            tensor<fp16, [1, 1, 2560]> var_2950_cast_fp16 = mul(x = x_77_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_2950_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_119_cast_fp16 = concat(axis = var_2948, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_2950_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2945_to_fp16 = const()[name = string("op_2945_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2945_to_fp16, x = input_119_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2955_split_sizes_0 = const()[name = string("op_2955_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2955_axis_0 = const()[name = string("op_2955_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2955_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2955_cast_fp16_1 = split(axis = var_2955_axis_0, split_sizes = var_2955_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2955_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543147072)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_39_cast_fp16 = mul(x = var_2955_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.14p-1])];
+            tensor<fp16, [1, 1, 2560]> x_79_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            tensor<int32, [1]> var_2967_axes_0 = const()[name = string("op_2967_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2967_cast_fp16 = squeeze(axes = var_2967_axes_0, x = K_sliding_out_7_cast_fp16)[name = string("op_2967_cast_fp16")];
+            tensor<int32, [1]> var_2969_axes_0 = const()[name = string("op_2969_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2969_cast_fp16 = squeeze(axes = var_2969_axes_0, x = V_sliding_out_7_cast_fp16)[name = string("op_2969_cast_fp16")];
+            tensor<int32, [4]> var_2972_begin_0 = const()[name = string("op_2972_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_2972_end_0 = const()[name = string("op_2972_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_2972_end_mask_0 = const()[name = string("op_2972_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2972_squeeze_mask_0 = const()[name = string("op_2972_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2972_cast_fp16 = slice_by_index(begin = var_2972_begin_0, end = var_2972_end_0, end_mask = var_2972_end_mask_0, squeeze_mask = var_2972_squeeze_mask_0, x = K_sliding_in)[name = string("op_2972_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_9_axes_0 = const()[name = string("K_sliding_slot_9_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_9_cast_fp16 = expand_dims(axes = K_sliding_slot_9_axes_0, x = var_2972_cast_fp16)[name = string("K_sliding_slot_9_cast_fp16")];
+            tensor<int32, [4]> var_2977_begin_0 = const()[name = string("op_2977_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_2977_end_0 = const()[name = string("op_2977_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_2977_end_mask_0 = const()[name = string("op_2977_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2977_squeeze_mask_0 = const()[name = string("op_2977_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2977_cast_fp16 = slice_by_index(begin = var_2977_begin_0, end = var_2977_end_0, end_mask = var_2977_end_mask_0, squeeze_mask = var_2977_squeeze_mask_0, x = V_sliding_in)[name = string("op_2977_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_9_axes_0 = const()[name = string("V_sliding_slot_9_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_9_cast_fp16 = expand_dims(axes = V_sliding_slot_9_axes_0, x = var_2977_cast_fp16)[name = string("V_sliding_slot_9_cast_fp16")];
+            int32 var_2984 = const()[name = string("op_2984"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2986_cast_fp16 = mul(x = x_79_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_2986_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_121_cast_fp16 = concat(axis = var_2984, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_2986_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2981_to_fp16 = const()[name = string("op_2981_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_2981_to_fp16, x = input_121_cast_fp16)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_2991_split_sizes_0 = const()[name = string("op_2991_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2991_axis_0 = const()[name = string("op_2991_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2991_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2991_cast_fp16_1 = split(axis = var_2991_axis_0, split_sizes = var_2991_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_2991_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543152256)))];
+            tensor<fp16, [1, 1, 2560]> h_25_cast_fp16 = mul(x = var_2991_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_2997 = const()[name = string("op_2997"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3000_axes_0 = const()[name = string("op_3000_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2998_cast_fp16 = transpose(perm = var_2997, x = h_25_cast_fp16)[name = string("transpose_143")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3000_cast_fp16 = expand_dims(axes = var_3000_axes_0, x = var_2998_cast_fp16)[name = string("op_3000_cast_fp16")];
+            string var_3016_pad_type_0 = const()[name = string("op_3016_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3016_strides_0 = const()[name = string("op_3016_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3016_pad_0 = const()[name = string("op_3016_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3016_dilations_0 = const()[name = string("op_3016_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3016_groups_0 = const()[name = string("op_3016_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3016 = conv(dilations = var_3016_dilations_0, groups = var_3016_groups_0, pad = var_3016_pad_0, pad_type = var_3016_pad_type_0, strides = var_3016_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_3000_cast_fp16)[name = string("op_3016")];
+            tensor<int32, [4]> var_3021 = const()[name = string("op_3021"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3022 = reshape(shape = var_3021, x = var_3016)[name = string("op_3022")];
+            tensor<int32, [4]> var_3027 = const()[name = string("op_3027"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3037 = const()[name = string("op_3037"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3028 = transpose(perm = var_3027, x = var_3022)[name = string("transpose_142")];
+            tensor<fp16, [1, 8, 256]> x_81 = reshape(shape = var_3037, x = var_3028)[name = string("x_81")];
+            int32 var_3043 = const()[name = string("op_3043"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3045 = mul(x = x_81, y = const_49_promoted)[name = string("op_3045")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_125 = concat(axis = var_3043, interleave = input_125_interleave_0, values = (x_81, var_3045))[name = string("input_125")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3040_to_fp16 = const()[name = string("op_3040_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_3040_to_fp16, x = input_125)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_3050_split_sizes_0 = const()[name = string("op_3050_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3050_axis_0 = const()[name = string("op_3050_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3050_0, tensor<fp16, [1, 8, 256]> var_3050_1 = split(axis = var_3050_axis_0, split_sizes = var_3050_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_3050")];
+            tensor<fp16, [1, 8, 256]> var_3052 = mul(x = var_3050_0, y = layers_4_self_attn_q_norm_weight)[name = string("op_3052")];
+            tensor<int32, [4]> var_3057 = const()[name = string("op_3057"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_35 = reshape(shape = var_3057, x = var_3052)[name = string("q_35")];
+            tensor<fp16, [1, 8, 1, 256]> var_3059_cast_fp16 = mul(x = q_35, y = cos_s)[name = string("op_3059_cast_fp16")];
+            tensor<int32, [2]> var_3060_split_sizes_0 = const()[name = string("op_3060_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3060_axis_0 = const()[name = string("op_3060_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3060_0, tensor<fp16, [1, 8, 1, 128]> var_3060_1 = split(axis = var_3060_axis_0, split_sizes = var_3060_split_sizes_0, x = q_35)[name = string("op_3060")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3062 = mul(x = var_3060_1, y = const_50_promoted)[name = string("op_3062")];
+            int32 var_3064 = const()[name = string("op_3064"), val = int32(-1)];
+            bool var_3065_interleave_0 = const()[name = string("op_3065_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3065 = concat(axis = var_3064, interleave = var_3065_interleave_0, values = (var_3062, var_3060_0))[name = string("op_3065")];
+            tensor<fp16, [1, 8, 1, 256]> var_3066_cast_fp16 = mul(x = var_3065, y = sin_s)[name = string("op_3066_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_39_cast_fp16 = add(x = var_3059_cast_fp16, y = var_3066_cast_fp16)[name = string("q_39_cast_fp16")];
+            string var_3079_pad_type_0 = const()[name = string("op_3079_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3079_strides_0 = const()[name = string("op_3079_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3079_pad_0 = const()[name = string("op_3079_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3079_dilations_0 = const()[name = string("op_3079_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3079_groups_0 = const()[name = string("op_3079_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_3079 = conv(dilations = var_3079_dilations_0, groups = var_3079_groups_0, pad = var_3079_pad_0, pad_type = var_3079_pad_type_0, strides = var_3079_strides_0, weight = layers_4_self_attn_k_proj_weight_palettized, x = var_3000_cast_fp16)[name = string("op_3079")];
+            tensor<int32, [4]> var_3084 = const()[name = string("op_3084"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_3085 = reshape(shape = var_3084, x = var_3079)[name = string("op_3085")];
+            tensor<int32, [4]> var_3090 = const()[name = string("op_3090"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_3107_pad_type_0 = const()[name = string("op_3107_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3107_strides_0 = const()[name = string("op_3107_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3107_pad_0 = const()[name = string("op_3107_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3107_dilations_0 = const()[name = string("op_3107_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3107_groups_0 = const()[name = string("op_3107_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_3107 = conv(dilations = var_3107_dilations_0, groups = var_3107_groups_0, pad = var_3107_pad_0, pad_type = var_3107_pad_type_0, strides = var_3107_strides_0, weight = layers_4_self_attn_v_proj_weight_palettized, x = var_3000_cast_fp16)[name = string("op_3107")];
+            tensor<int32, [4]> var_3112 = const()[name = string("op_3112"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_3113 = reshape(shape = var_3112, x = var_3107)[name = string("op_3113")];
+            tensor<int32, [4]> var_3118 = const()[name = string("op_3118"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3128 = const()[name = string("op_3128"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_3091 = transpose(perm = var_3090, x = var_3085)[name = string("transpose_141")];
+            tensor<fp16, [1, 2, 256]> x_83 = reshape(shape = var_3128, x = var_3091)[name = string("x_83")];
+            int32 var_3134 = const()[name = string("op_3134"), val = int32(-1)];
+            fp16 const_51_promoted = const()[name = string("const_51_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_3136 = mul(x = x_83, y = const_51_promoted)[name = string("op_3136")];
+            bool input_127_interleave_0 = const()[name = string("input_127_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_127 = concat(axis = var_3134, interleave = input_127_interleave_0, values = (x_83, var_3136))[name = string("input_127")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3131_to_fp16 = const()[name = string("op_3131_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_3131_to_fp16, x = input_127)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_3141_split_sizes_0 = const()[name = string("op_3141_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3141_axis_0 = const()[name = string("op_3141_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_3141_0, tensor<fp16, [1, 2, 256]> var_3141_1 = split(axis = var_3141_axis_0, split_sizes = var_3141_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_3141")];
+            tensor<fp16, [1, 2, 256]> var_3143 = mul(x = var_3141_0, y = layers_4_self_attn_k_norm_weight)[name = string("op_3143")];
+            tensor<int32, [4]> var_3148 = const()[name = string("op_3148"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_37 = reshape(shape = var_3148, x = var_3143)[name = string("q_37")];
+            fp16 var_3150_promoted = const()[name = string("op_3150_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_3119 = transpose(perm = var_3118, x = var_3113)[name = string("transpose_140")];
+            tensor<fp16, [1, 2, 1, 256]> var_3151 = pow(x = var_3119, y = var_3150_promoted)[name = string("op_3151")];
+            tensor<int32, [1]> var_3156_axes_0 = const()[name = string("op_3156_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3156_keep_dims_0 = const()[name = string("op_3156_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_3156 = reduce_mean(axes = var_3156_axes_0, keep_dims = var_3156_keep_dims_0, x = var_3151)[name = string("op_3156")];
+            fp16 var_3158_to_fp16 = const()[name = string("op_3158_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_9_cast_fp16 = add(x = var_3156, y = var_3158_to_fp16)[name = string("mean_sq_9_cast_fp16")];
+            fp32 var_3160_epsilon_0 = const()[name = string("op_3160_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_3160_cast_fp16 = rsqrt(epsilon = var_3160_epsilon_0, x = mean_sq_9_cast_fp16)[name = string("op_3160_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_131_cast_fp16 = mul(x = var_3119, y = var_3160_cast_fp16)[name = string("input_131_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_3162_cast_fp16 = mul(x = q_37, y = cos_s)[name = string("op_3162_cast_fp16")];
+            tensor<int32, [2]> var_3163_split_sizes_0 = const()[name = string("op_3163_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3163_axis_0 = const()[name = string("op_3163_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_3163_0, tensor<fp16, [1, 2, 1, 128]> var_3163_1 = split(axis = var_3163_axis_0, split_sizes = var_3163_split_sizes_0, x = q_37)[name = string("op_3163")];
+            fp16 const_52_promoted = const()[name = string("const_52_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_3165 = mul(x = var_3163_1, y = const_52_promoted)[name = string("op_3165")];
+            int32 var_3167 = const()[name = string("op_3167"), val = int32(-1)];
+            bool var_3168_interleave_0 = const()[name = string("op_3168_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_3168 = concat(axis = var_3167, interleave = var_3168_interleave_0, values = (var_3165, var_3163_0))[name = string("op_3168")];
+            tensor<fp16, [1, 2, 1, 256]> var_3169_cast_fp16 = mul(x = var_3168, y = sin_s)[name = string("op_3169_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_129_cast_fp16 = add(x = var_3162_cast_fp16, y = var_3169_cast_fp16)[name = string("input_129_cast_fp16")];
+            tensor<int32, [8]> k_padded_9_pad_0 = const()[name = string("k_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_9_mode_0 = const()[name = string("k_padded_9_mode_0"), val = string("constant")];
+            fp16 const_53_to_fp16 = const()[name = string("const_53_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_9_cast_fp16 = pad(constant_val = const_53_to_fp16, mode = k_padded_9_mode_0, pad = k_padded_9_pad_0, x = input_129_cast_fp16)[name = string("k_padded_9_cast_fp16")];
+            tensor<int32, [8]> v_padded_9_pad_0 = const()[name = string("v_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_9_mode_0 = const()[name = string("v_padded_9_mode_0"), val = string("constant")];
+            fp16 const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_9_cast_fp16 = pad(constant_val = const_54_to_fp16, mode = v_padded_9_mode_0, pad = v_padded_9_pad_0, x = input_131_cast_fp16)[name = string("v_padded_9_cast_fp16")];
+            tensor<int32, [4]> var_3198_begin_0 = const()[name = string("op_3198_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3198_end_0 = const()[name = string("op_3198_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3198_end_mask_0 = const()[name = string("op_3198_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3198_cast_fp16 = slice_by_index(begin = var_3198_begin_0, end = var_3198_end_0, end_mask = var_3198_end_mask_0, x = K_sliding_slot_9_cast_fp16)[name = string("op_3198_cast_fp16")];
+            int32 var_3205 = const()[name = string("op_3205"), val = int32(2)];
+            bool K_sliding_out_9_interleave_0 = const()[name = string("K_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_9_cast_fp16 = concat(axis = var_3205, interleave = K_sliding_out_9_interleave_0, values = (var_3198_cast_fp16, k_padded_9_cast_fp16))[name = string("K_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3221_begin_0 = const()[name = string("op_3221_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3221_end_0 = const()[name = string("op_3221_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3221_end_mask_0 = const()[name = string("op_3221_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3221_cast_fp16 = slice_by_index(begin = var_3221_begin_0, end = var_3221_end_0, end_mask = var_3221_end_mask_0, x = V_sliding_slot_9_cast_fp16)[name = string("op_3221_cast_fp16")];
+            int32 var_3228 = const()[name = string("op_3228"), val = int32(2)];
+            bool V_sliding_out_9_interleave_0 = const()[name = string("V_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_9_cast_fp16 = concat(axis = var_3228, interleave = V_sliding_out_9_interleave_0, values = (var_3221_cast_fp16, v_padded_9_cast_fp16))[name = string("V_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_9_begin_0 = const()[name = string("K_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_9_end_0 = const()[name = string("K_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_9_end_mask_0 = const()[name = string("K_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_9_cast_fp16 = slice_by_index(begin = K_for_attn_9_begin_0, end = K_for_attn_9_end_0, end_mask = K_for_attn_9_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("K_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_9_begin_0 = const()[name = string("V_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_9_end_0 = const()[name = string("V_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_9_end_mask_0 = const()[name = string("V_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_9_cast_fp16 = slice_by_index(begin = V_for_attn_9_begin_0, end = V_for_attn_9_end_0, end_mask = V_for_attn_9_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("V_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> transpose_16_perm_0 = const()[name = string("transpose_16_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_8_reps_0 = const()[name = string("tile_8_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_16_cast_fp16 = transpose(perm = transpose_16_perm_0, x = K_for_attn_9_cast_fp16)[name = string("transpose_139")];
+            tensor<fp16, [8, 1, 512, 256]> tile_8_cast_fp16 = tile(reps = tile_8_reps_0, x = transpose_16_cast_fp16)[name = string("tile_8_cast_fp16")];
+            tensor<int32, [5]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_16_cast_fp16 = reshape(shape = concat_16, x = tile_8_cast_fp16)[name = string("reshape_16_cast_fp16")];
+            tensor<int32, [5]> transpose_17_perm_0 = const()[name = string("transpose_17_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_17 = const()[name = string("concat_17"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_17_cast_fp16 = transpose(perm = transpose_17_perm_0, x = reshape_16_cast_fp16)[name = string("transpose_138")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_17_cast_fp16 = reshape(shape = concat_17, x = transpose_17_cast_fp16)[name = string("reshape_17_cast_fp16")];
+            tensor<int32, [4]> transpose_52_perm_0 = const()[name = string("transpose_52_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_18_perm_0 = const()[name = string("transpose_18_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_9_reps_0 = const()[name = string("tile_9_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_18_cast_fp16 = transpose(perm = transpose_18_perm_0, x = V_for_attn_9_cast_fp16)[name = string("transpose_137")];
+            tensor<fp16, [8, 1, 512, 256]> tile_9_cast_fp16 = tile(reps = tile_9_reps_0, x = transpose_18_cast_fp16)[name = string("tile_9_cast_fp16")];
+            tensor<int32, [5]> concat_18 = const()[name = string("concat_18"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_18_cast_fp16 = reshape(shape = concat_18, x = tile_9_cast_fp16)[name = string("reshape_18_cast_fp16")];
+            tensor<int32, [5]> transpose_19_perm_0 = const()[name = string("transpose_19_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_19 = const()[name = string("concat_19"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_19_cast_fp16 = transpose(perm = transpose_19_perm_0, x = reshape_18_cast_fp16)[name = string("transpose_136")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_19_cast_fp16 = reshape(shape = concat_19, x = transpose_19_cast_fp16)[name = string("reshape_19_cast_fp16")];
+            tensor<int32, [4]> V_expanded_9_perm_0 = const()[name = string("V_expanded_9_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_52_cast_fp16 = transpose(perm = transpose_52_perm_0, x = reshape_17_cast_fp16)[name = string("transpose_135")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_39_cast_fp16, y = transpose_52_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_87_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_87_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_87_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 1, 512]> var_3269 = sub(x = x_87_cast_fp16, y = reduce_max_4)[name = string("op_3269")];
+            tensor<fp16, [1, 8, 1, 512]> var_3275 = exp(x = var_3269)[name = string("op_3275")];
+            tensor<int32, [1]> var_3285_axes_0 = const()[name = string("op_3285_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3285_keep_dims_0 = const()[name = string("op_3285_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3285 = reduce_sum(axes = var_3285_axes_0, keep_dims = var_3285_keep_dims_0, x = var_3275)[name = string("op_3285")];
+            tensor<fp16, [1, 8, 1, 512]> var_3291_cast_fp16 = real_div(x = var_3275, y = var_3285)[name = string("op_3291_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_9_cast_fp16 = transpose(perm = V_expanded_9_perm_0, x = reshape_19_cast_fp16)[name = string("transpose_134")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_3291_cast_fp16, y = V_expanded_9_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_3302 = const()[name = string("op_3302"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3309 = const()[name = string("op_3309"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3303_cast_fp16 = transpose(perm = var_3302, x = attn_output_25_cast_fp16)[name = string("transpose_133")];
+            tensor<fp16, [1, 1, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_3309, x = var_3303_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_3314 = const()[name = string("op_3314"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3330_pad_type_0 = const()[name = string("op_3330_pad_type_0"), val = string("valid")];
+            int32 var_3330_groups_0 = const()[name = string("op_3330_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3330_strides_0 = const()[name = string("op_3330_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3330_pad_0 = const()[name = string("op_3330_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3330_dilations_0 = const()[name = string("op_3330_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543157440))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545778944))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3315_cast_fp16 = transpose(perm = var_3314, x = attn_output_27_cast_fp16)[name = string("transpose_132")];
+            tensor<fp16, [1, 2560, 1]> var_3330_cast_fp16 = conv(dilations = var_3330_dilations_0, groups = var_3330_groups_0, pad = var_3330_pad_0, pad_type = var_3330_pad_type_0, strides = var_3330_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_3315_cast_fp16)[name = string("op_3330_cast_fp16")];
+            tensor<int32, [3]> var_3334 = const()[name = string("op_3334"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3340 = const()[name = string("op_3340"), val = int32(-1)];
+            fp16 const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_91_cast_fp16 = transpose(perm = var_3334, x = var_3330_cast_fp16)[name = string("transpose_131")];
+            tensor<fp16, [1, 1, 2560]> var_3342_cast_fp16 = mul(x = x_91_cast_fp16, y = const_55_promoted_to_fp16)[name = string("op_3342_cast_fp16")];
+            bool input_135_interleave_0 = const()[name = string("input_135_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_135_cast_fp16 = concat(axis = var_3340, interleave = input_135_interleave_0, values = (x_91_cast_fp16, var_3342_cast_fp16))[name = string("input_135_cast_fp16")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3337_to_fp16 = const()[name = string("op_3337_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_3337_to_fp16, x = input_135_cast_fp16)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_3347_split_sizes_0 = const()[name = string("op_3347_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3347_axis_0 = const()[name = string("op_3347_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3347_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3347_cast_fp16_1 = split(axis = var_3347_axis_0, split_sizes = var_3347_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_3347_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545781568)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_29_cast_fp16 = mul(x = var_3347_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_93_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_93_cast_fp16")];
+            int32 var_3356 = const()[name = string("op_3356"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3358_cast_fp16 = mul(x = x_93_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3358_cast_fp16")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_137_cast_fp16 = concat(axis = var_3356, interleave = input_137_interleave_0, values = (x_93_cast_fp16, var_3358_cast_fp16))[name = string("input_137_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3353_to_fp16 = const()[name = string("op_3353_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_3353_to_fp16, x = input_137_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_3363_split_sizes_0 = const()[name = string("op_3363_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3363_axis_0 = const()[name = string("op_3363_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3363_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3363_cast_fp16_1 = split(axis = var_3363_axis_0, split_sizes = var_3363_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_3363_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545786752)))];
+            tensor<fp16, [1, 1, 2560]> h_27_cast_fp16 = mul(x = var_3363_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_3374 = const()[name = string("op_3374"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3375 = transpose(perm = var_3374, x = h_27_cast_fp16)[name = string("transpose_130")];
+            tensor<fp16, [1, 2560, 1, 1]> input_139 = expand_dims(axes = input_139_axes_0, x = var_3375)[name = string("input_139")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_139)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_139)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 1]> input_141 = mul(x = gate_19, y = up_9)[name = string("input_141")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_141)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_3415_axes_0 = const()[name = string("op_3415_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3415 = squeeze(axes = var_3415_axes_0, x = mlp_out_9)[name = string("op_3415")];
+            tensor<int32, [3]> var_3419 = const()[name = string("op_3419"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3425 = const()[name = string("op_3425"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_95 = transpose(perm = var_3419, x = var_3415)[name = string("transpose_129")];
+            tensor<fp16, [1, 1, 2560]> var_3427 = mul(x = x_95, y = const_57_promoted)[name = string("op_3427")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_143 = concat(axis = var_3425, interleave = input_143_interleave_0, values = (x_95, var_3427))[name = string("input_143")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3422_to_fp16 = const()[name = string("op_3422_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_3422_to_fp16, x = input_143)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_3432_split_sizes_0 = const()[name = string("op_3432_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3432_axis_0 = const()[name = string("op_3432_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3432_0, tensor<fp16, [1, 1, 2560]> var_3432_1 = split(axis = var_3432_axis_0, split_sizes = var_3432_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_3432")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_43 = mul(x = var_3432_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_45_cast_fp16 = add(x = x_93_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 4096])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 1, 4352])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_3460 = const()[name = string("op_3460"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_145_axes_0 = const()[name = string("input_145_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3461 = transpose(perm = var_3460, x = hidden_states_45_cast_fp16)[name = string("transpose_128")];
+            tensor<fp16, [1, 2560, 1, 1]> input_145 = expand_dims(axes = input_145_axes_0, x = var_3461)[name = string("input_145")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_145)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_3480 = const()[name = string("op_3480"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3481_cast_fp16 = transpose(perm = var_3480, x = per_layer_slice_9_cast_fp16)[name = string("transpose_127")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_3481_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_147_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_147_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545791936))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546119680))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_147_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_3497_axes_0 = const()[name = string("op_3497_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3497_cast_fp16 = squeeze(axes = var_3497_axes_0, x = gated_29_cast_fp16)[name = string("op_3497_cast_fp16")];
+            tensor<int32, [3]> var_3501 = const()[name = string("op_3501"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3507 = const()[name = string("op_3507"), val = int32(-1)];
+            fp16 const_58_promoted_to_fp16 = const()[name = string("const_58_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_97_cast_fp16 = transpose(perm = var_3501, x = var_3497_cast_fp16)[name = string("transpose_126")];
+            tensor<fp16, [1, 1, 2560]> var_3509_cast_fp16 = mul(x = x_97_cast_fp16, y = const_58_promoted_to_fp16)[name = string("op_3509_cast_fp16")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_149_cast_fp16 = concat(axis = var_3507, interleave = input_149_interleave_0, values = (x_97_cast_fp16, var_3509_cast_fp16))[name = string("input_149_cast_fp16")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3504_to_fp16 = const()[name = string("op_3504_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_3504_to_fp16, x = input_149_cast_fp16)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_3514_split_sizes_0 = const()[name = string("op_3514_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3514_axis_0 = const()[name = string("op_3514_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3514_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3514_cast_fp16_1 = split(axis = var_3514_axis_0, split_sizes = var_3514_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_3514_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546122304)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_49_cast_fp16 = mul(x = var_3514_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.46p-1])];
+            tensor<fp16, [1, 1, 2560]> x_99_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_59_promoted_to_fp16)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> var_3526_axes_0 = const()[name = string("op_3526_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3526_cast_fp16 = squeeze(axes = var_3526_axes_0, x = K_sliding_out_9_cast_fp16)[name = string("op_3526_cast_fp16")];
+            tensor<int32, [1]> var_3528_axes_0 = const()[name = string("op_3528_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3528_cast_fp16 = squeeze(axes = var_3528_axes_0, x = V_sliding_out_9_cast_fp16)[name = string("op_3528_cast_fp16")];
+            tensor<int32, [4]> var_3531_begin_0 = const()[name = string("op_3531_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3531_end_0 = const()[name = string("op_3531_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_3531_end_mask_0 = const()[name = string("op_3531_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3531_squeeze_mask_0 = const()[name = string("op_3531_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_3531_cast_fp16 = slice_by_index(begin = var_3531_begin_0, end = var_3531_end_0, end_mask = var_3531_end_mask_0, squeeze_mask = var_3531_squeeze_mask_0, x = K_full_in)[name = string("op_3531_cast_fp16")];
+            tensor<int32, [1]> K_full_slot_1_axes_0 = const()[name = string("K_full_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_slot_1_cast_fp16 = expand_dims(axes = K_full_slot_1_axes_0, x = var_3531_cast_fp16)[name = string("K_full_slot_1_cast_fp16")];
+            tensor<int32, [4]> var_3536_begin_0 = const()[name = string("op_3536_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3536_end_0 = const()[name = string("op_3536_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_3536_end_mask_0 = const()[name = string("op_3536_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3536_squeeze_mask_0 = const()[name = string("op_3536_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_3536_cast_fp16 = slice_by_index(begin = var_3536_begin_0, end = var_3536_end_0, end_mask = var_3536_end_mask_0, squeeze_mask = var_3536_squeeze_mask_0, x = V_full_in)[name = string("op_3536_cast_fp16")];
+            tensor<int32, [1]> V_full_slot_1_axes_0 = const()[name = string("V_full_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_slot_1_cast_fp16 = expand_dims(axes = V_full_slot_1_axes_0, x = var_3536_cast_fp16)[name = string("V_full_slot_1_cast_fp16")];
+            int32 var_3543 = const()[name = string("op_3543"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3545_cast_fp16 = mul(x = x_99_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3545_cast_fp16")];
+            bool input_151_interleave_0 = const()[name = string("input_151_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_151_cast_fp16 = concat(axis = var_3543, interleave = input_151_interleave_0, values = (x_99_cast_fp16, var_3545_cast_fp16))[name = string("input_151_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3540_to_fp16 = const()[name = string("op_3540_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_3540_to_fp16, x = input_151_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_3550_split_sizes_0 = const()[name = string("op_3550_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3550_axis_0 = const()[name = string("op_3550_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3550_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3550_cast_fp16_1 = split(axis = var_3550_axis_0, split_sizes = var_3550_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_3550_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546127488)))];
+            tensor<fp16, [1, 1, 2560]> h_31_cast_fp16 = mul(x = var_3550_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_3556 = const()[name = string("op_3556"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3559_axes_0 = const()[name = string("op_3559_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3557_cast_fp16 = transpose(perm = var_3556, x = h_31_cast_fp16)[name = string("transpose_125")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3559_cast_fp16 = expand_dims(axes = var_3559_axes_0, x = var_3557_cast_fp16)[name = string("op_3559_cast_fp16")];
+            string var_3575_pad_type_0 = const()[name = string("op_3575_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3575_strides_0 = const()[name = string("op_3575_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3575_pad_0 = const()[name = string("op_3575_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3575_dilations_0 = const()[name = string("op_3575_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3575_groups_0 = const()[name = string("op_3575_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_3575 = conv(dilations = var_3575_dilations_0, groups = var_3575_groups_0, pad = var_3575_pad_0, pad_type = var_3575_pad_type_0, strides = var_3575_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_3559_cast_fp16)[name = string("op_3575")];
+            tensor<int32, [4]> var_3580 = const()[name = string("op_3580"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_3581 = reshape(shape = var_3580, x = var_3575)[name = string("op_3581")];
+            tensor<int32, [4]> var_3586 = const()[name = string("op_3586"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3596 = const()[name = string("op_3596"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_3587 = transpose(perm = var_3586, x = var_3581)[name = string("transpose_124")];
+            tensor<fp16, [1, 8, 512]> x_101 = reshape(shape = var_3596, x = var_3587)[name = string("x_101")];
+            int32 var_3602 = const()[name = string("op_3602"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_3604 = mul(x = x_101, y = const_61_promoted)[name = string("op_3604")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_155 = concat(axis = var_3602, interleave = input_155_interleave_0, values = (x_101, var_3604))[name = string("input_155")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3599_to_fp16 = const()[name = string("op_3599_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_3599_to_fp16, x = input_155)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_3609_split_sizes_0 = const()[name = string("op_3609_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3609_axis_0 = const()[name = string("op_3609_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_3609_0, tensor<fp16, [1, 8, 512]> var_3609_1 = split(axis = var_3609_axis_0, split_sizes = var_3609_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_3609")];
+            tensor<fp16, [1, 8, 512]> var_3611 = mul(x = var_3609_0, y = layers_5_self_attn_q_norm_weight)[name = string("op_3611")];
+            tensor<int32, [4]> var_3616 = const()[name = string("op_3616"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_43 = reshape(shape = var_3616, x = var_3611)[name = string("q_43")];
+            tensor<fp16, [1, 8, 1, 512]> var_3618_cast_fp16 = mul(x = q_43, y = cos_f)[name = string("op_3618_cast_fp16")];
+            tensor<int32, [2]> var_3619_split_sizes_0 = const()[name = string("op_3619_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3619_axis_0 = const()[name = string("op_3619_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_3619_0, tensor<fp16, [1, 8, 1, 256]> var_3619_1 = split(axis = var_3619_axis_0, split_sizes = var_3619_split_sizes_0, x = q_43)[name = string("op_3619")];
+            fp16 const_62_promoted = const()[name = string("const_62_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_3621 = mul(x = var_3619_1, y = const_62_promoted)[name = string("op_3621")];
+            int32 var_3623 = const()[name = string("op_3623"), val = int32(-1)];
+            bool var_3624_interleave_0 = const()[name = string("op_3624_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_3624 = concat(axis = var_3623, interleave = var_3624_interleave_0, values = (var_3621, var_3619_0))[name = string("op_3624")];
+            tensor<fp16, [1, 8, 1, 512]> var_3625_cast_fp16 = mul(x = var_3624, y = sin_f)[name = string("op_3625_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_47_cast_fp16 = add(x = var_3618_cast_fp16, y = var_3625_cast_fp16)[name = string("q_47_cast_fp16")];
+            string var_3638_pad_type_0 = const()[name = string("op_3638_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3638_strides_0 = const()[name = string("op_3638_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3638_pad_0 = const()[name = string("op_3638_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3638_dilations_0 = const()[name = string("op_3638_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3638_groups_0 = const()[name = string("op_3638_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_3638 = conv(dilations = var_3638_dilations_0, groups = var_3638_groups_0, pad = var_3638_pad_0, pad_type = var_3638_pad_type_0, strides = var_3638_strides_0, weight = layers_5_self_attn_k_proj_weight_palettized, x = var_3559_cast_fp16)[name = string("op_3638")];
+            tensor<int32, [4]> var_3643 = const()[name = string("op_3643"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_3644 = reshape(shape = var_3643, x = var_3638)[name = string("op_3644")];
+            tensor<int32, [4]> var_3649 = const()[name = string("op_3649"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_3666_pad_type_0 = const()[name = string("op_3666_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3666_strides_0 = const()[name = string("op_3666_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3666_pad_0 = const()[name = string("op_3666_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3666_dilations_0 = const()[name = string("op_3666_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3666_groups_0 = const()[name = string("op_3666_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_3666 = conv(dilations = var_3666_dilations_0, groups = var_3666_groups_0, pad = var_3666_pad_0, pad_type = var_3666_pad_type_0, strides = var_3666_strides_0, weight = layers_5_self_attn_v_proj_weight_palettized, x = var_3559_cast_fp16)[name = string("op_3666")];
+            tensor<int32, [4]> var_3671 = const()[name = string("op_3671"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_3672 = reshape(shape = var_3671, x = var_3666)[name = string("op_3672")];
+            tensor<int32, [4]> var_3677 = const()[name = string("op_3677"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3687 = const()[name = string("op_3687"), val = tensor<int32, [3]>([1, 2, 512])];
+            tensor<fp16, [1, 2, 1, 512]> var_3650 = transpose(perm = var_3649, x = var_3644)[name = string("transpose_123")];
+            tensor<fp16, [1, 2, 512]> x_103 = reshape(shape = var_3687, x = var_3650)[name = string("x_103")];
+            int32 var_3693 = const()[name = string("op_3693"), val = int32(-1)];
+            fp16 const_63_promoted = const()[name = string("const_63_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 512]> var_3695 = mul(x = x_103, y = const_63_promoted)[name = string("op_3695")];
+            bool input_157_interleave_0 = const()[name = string("input_157_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1024]> input_157 = concat(axis = var_3693, interleave = input_157_interleave_0, values = (x_103, var_3695))[name = string("input_157")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3690_to_fp16 = const()[name = string("op_3690_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1024]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_3690_to_fp16, x = input_157)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_3700_split_sizes_0 = const()[name = string("op_3700_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3700_axis_0 = const()[name = string("op_3700_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 512]> var_3700_0, tensor<fp16, [1, 2, 512]> var_3700_1 = split(axis = var_3700_axis_0, split_sizes = var_3700_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_3700")];
+            tensor<fp16, [1, 2, 512]> var_3702 = mul(x = var_3700_0, y = layers_5_self_attn_k_norm_weight)[name = string("op_3702")];
+            tensor<int32, [4]> var_3707 = const()[name = string("op_3707"), val = tensor<int32, [4]>([1, 2, 1, 512])];
+            tensor<fp16, [1, 2, 1, 512]> q_45 = reshape(shape = var_3707, x = var_3702)[name = string("q_45")];
+            fp16 var_3709_promoted = const()[name = string("op_3709_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 512]> var_3678 = transpose(perm = var_3677, x = var_3672)[name = string("transpose_122")];
+            tensor<fp16, [1, 2, 1, 512]> var_3710 = pow(x = var_3678, y = var_3709_promoted)[name = string("op_3710")];
+            tensor<int32, [1]> var_3715_axes_0 = const()[name = string("op_3715_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3715_keep_dims_0 = const()[name = string("op_3715_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_3715 = reduce_mean(axes = var_3715_axes_0, keep_dims = var_3715_keep_dims_0, x = var_3710)[name = string("op_3715")];
+            fp16 var_3717_to_fp16 = const()[name = string("op_3717_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_11_cast_fp16 = add(x = var_3715, y = var_3717_to_fp16)[name = string("mean_sq_11_cast_fp16")];
+            fp32 var_3719_epsilon_0 = const()[name = string("op_3719_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_3719_cast_fp16 = rsqrt(epsilon = var_3719_epsilon_0, x = mean_sq_11_cast_fp16)[name = string("op_3719_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> v_1_cast_fp16 = mul(x = var_3678, y = var_3719_cast_fp16)[name = string("v_1_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> var_3721_cast_fp16 = mul(x = q_45, y = cos_f)[name = string("op_3721_cast_fp16")];
+            tensor<int32, [2]> var_3722_split_sizes_0 = const()[name = string("op_3722_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3722_axis_0 = const()[name = string("op_3722_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 256]> var_3722_0, tensor<fp16, [1, 2, 1, 256]> var_3722_1 = split(axis = var_3722_axis_0, split_sizes = var_3722_split_sizes_0, x = q_45)[name = string("op_3722")];
+            fp16 const_64_promoted = const()[name = string("const_64_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 256]> var_3724 = mul(x = var_3722_1, y = const_64_promoted)[name = string("op_3724")];
+            int32 var_3726 = const()[name = string("op_3726"), val = int32(-1)];
+            bool var_3727_interleave_0 = const()[name = string("op_3727_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 512]> var_3727 = concat(axis = var_3726, interleave = var_3727_interleave_0, values = (var_3724, var_3722_0))[name = string("op_3727")];
+            tensor<fp16, [1, 2, 1, 512]> var_3728_cast_fp16 = mul(x = var_3727, y = sin_f)[name = string("op_3728_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> k_13_cast_fp16 = add(x = var_3721_cast_fp16, y = var_3728_cast_fp16)[name = string("k_13_cast_fp16")];
+            fp16 var_3731_promoted_to_fp16 = const()[name = string("op_3731_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 2048, 1]> var_3733_cast_fp16 = sub(x = var_3731_promoted_to_fp16, y = update_mask)[name = string("op_3733_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3734_cast_fp16 = mul(x = K_full_slot_1_cast_fp16, y = var_3733_cast_fp16)[name = string("op_3734_cast_fp16")];
+            tensor<int32, [4]> var_3735_reps_0 = const()[name = string("op_3735_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_3735_cast_fp16 = tile(reps = var_3735_reps_0, x = k_13_cast_fp16)[name = string("op_3735_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3736_cast_fp16 = mul(x = var_3735_cast_fp16, y = update_mask)[name = string("op_3736_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_out_1_cast_fp16 = add(x = var_3734_cast_fp16, y = var_3736_cast_fp16)[name = string("K_full_out_1_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3742_cast_fp16 = mul(x = V_full_slot_1_cast_fp16, y = var_3733_cast_fp16)[name = string("op_3742_cast_fp16")];
+            tensor<int32, [4]> var_3743_reps_0 = const()[name = string("op_3743_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_3743_cast_fp16 = tile(reps = var_3743_reps_0, x = v_1_cast_fp16)[name = string("op_3743_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_3744_cast_fp16 = mul(x = var_3743_cast_fp16, y = update_mask)[name = string("op_3744_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_out_1_cast_fp16 = add(x = var_3742_cast_fp16, y = var_3744_cast_fp16)[name = string("V_full_out_1_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = K_full_out_1_cast_fp16)[name = string("transpose_121")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_20, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_21 = const()[name = string("concat_21"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_120")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_21, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_53_perm_0 = const()[name = string("transpose_53_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = V_full_out_1_cast_fp16)[name = string("transpose_119")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_22, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_118")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_23, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_53_cast_fp16 = transpose(perm = transpose_53_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_117")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_47_cast_fp16, y = transpose_53_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_107_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_107_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_107_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3786 = sub(x = x_107_cast_fp16, y = reduce_max_5)[name = string("op_3786")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3792 = exp(x = var_3786)[name = string("op_3792")];
+            tensor<int32, [1]> var_3802_axes_0 = const()[name = string("op_3802_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3802_keep_dims_0 = const()[name = string("op_3802_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3802 = reduce_sum(axes = var_3802_axes_0, keep_dims = var_3802_keep_dims_0, x = var_3792)[name = string("op_3802")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3808_cast_fp16 = real_div(x = var_3792, y = var_3802)[name = string("op_3808_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_116")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_3808_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_3819 = const()[name = string("op_3819"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3826 = const()[name = string("op_3826"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_3820_cast_fp16 = transpose(perm = var_3819, x = attn_output_31_cast_fp16)[name = string("transpose_115")];
+            tensor<fp16, [1, 1, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_3826, x = var_3820_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_3831 = const()[name = string("op_3831"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3847_pad_type_0 = const()[name = string("op_3847_pad_type_0"), val = string("valid")];
+            int32 var_3847_groups_0 = const()[name = string("op_3847_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3847_strides_0 = const()[name = string("op_3847_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3847_pad_0 = const()[name = string("op_3847_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3847_dilations_0 = const()[name = string("op_3847_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546132672))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551375616))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_3832_cast_fp16 = transpose(perm = var_3831, x = attn_output_33_cast_fp16)[name = string("transpose_114")];
+            tensor<fp16, [1, 2560, 1]> var_3847_cast_fp16 = conv(dilations = var_3847_dilations_0, groups = var_3847_groups_0, pad = var_3847_pad_0, pad_type = var_3847_pad_type_0, strides = var_3847_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_3832_cast_fp16)[name = string("op_3847_cast_fp16")];
+            tensor<int32, [3]> var_3851 = const()[name = string("op_3851"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3857 = const()[name = string("op_3857"), val = int32(-1)];
+            fp16 const_65_promoted_to_fp16 = const()[name = string("const_65_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_111_cast_fp16 = transpose(perm = var_3851, x = var_3847_cast_fp16)[name = string("transpose_113")];
+            tensor<fp16, [1, 1, 2560]> var_3859_cast_fp16 = mul(x = x_111_cast_fp16, y = const_65_promoted_to_fp16)[name = string("op_3859_cast_fp16")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_161_cast_fp16 = concat(axis = var_3857, interleave = input_161_interleave_0, values = (x_111_cast_fp16, var_3859_cast_fp16))[name = string("input_161_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3854_to_fp16 = const()[name = string("op_3854_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_3854_to_fp16, x = input_161_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_3864_split_sizes_0 = const()[name = string("op_3864_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3864_axis_0 = const()[name = string("op_3864_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3864_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3864_cast_fp16_1 = split(axis = var_3864_axis_0, split_sizes = var_3864_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_3864_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551378240)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_35_cast_fp16 = mul(x = var_3864_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_113_cast_fp16 = add(x = x_99_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_113_cast_fp16")];
+            int32 var_3873 = const()[name = string("op_3873"), val = int32(-1)];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3875_cast_fp16 = mul(x = x_113_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_3875_cast_fp16")];
+            bool input_163_interleave_0 = const()[name = string("input_163_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_163_cast_fp16 = concat(axis = var_3873, interleave = input_163_interleave_0, values = (x_113_cast_fp16, var_3875_cast_fp16))[name = string("input_163_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3870_to_fp16 = const()[name = string("op_3870_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_3870_to_fp16, x = input_163_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_3880_split_sizes_0 = const()[name = string("op_3880_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3880_axis_0 = const()[name = string("op_3880_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3880_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3880_cast_fp16_1 = split(axis = var_3880_axis_0, split_sizes = var_3880_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_3880_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551383424)))];
+            tensor<fp16, [1, 1, 2560]> h_33_cast_fp16 = mul(x = var_3880_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_3891 = const()[name = string("op_3891"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_165_axes_0 = const()[name = string("input_165_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3892 = transpose(perm = var_3891, x = h_33_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [1, 2560, 1, 1]> input_165 = expand_dims(axes = input_165_axes_0, x = var_3892)[name = string("input_165")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_165)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_165)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 1]> input_167 = mul(x = gate_23, y = up_11)[name = string("input_167")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_167)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_3932_axes_0 = const()[name = string("op_3932_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3932 = squeeze(axes = var_3932_axes_0, x = mlp_out_11)[name = string("op_3932")];
+            tensor<int32, [3]> var_3936 = const()[name = string("op_3936"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3942 = const()[name = string("op_3942"), val = int32(-1)];
+            fp16 const_67_promoted = const()[name = string("const_67_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_115 = transpose(perm = var_3936, x = var_3932)[name = string("transpose_111")];
+            tensor<fp16, [1, 1, 2560]> var_3944 = mul(x = x_115, y = const_67_promoted)[name = string("op_3944")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_169 = concat(axis = var_3942, interleave = input_169_interleave_0, values = (x_115, var_3944))[name = string("input_169")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3939_to_fp16 = const()[name = string("op_3939_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_3939_to_fp16, x = input_169)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_3949_split_sizes_0 = const()[name = string("op_3949_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3949_axis_0 = const()[name = string("op_3949_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3949_0, tensor<fp16, [1, 1, 2560]> var_3949_1 = split(axis = var_3949_axis_0, split_sizes = var_3949_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_3949")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_53 = mul(x = var_3949_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_55_cast_fp16 = add(x = x_113_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 4352])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 1, 4608])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_3977 = const()[name = string("op_3977"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_171_axes_0 = const()[name = string("input_171_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3978 = transpose(perm = var_3977, x = hidden_states_55_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [1, 2560, 1, 1]> input_171 = expand_dims(axes = input_171_axes_0, x = var_3978)[name = string("input_171")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_171)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_3997 = const()[name = string("op_3997"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3998_cast_fp16 = transpose(perm = var_3997, x = per_layer_slice_11_cast_fp16)[name = string("transpose_109")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_3998_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_173_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_173_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551388608))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551716352))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_173_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_4014_axes_0 = const()[name = string("op_4014_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4014_cast_fp16 = squeeze(axes = var_4014_axes_0, x = gated_35_cast_fp16)[name = string("op_4014_cast_fp16")];
+            tensor<int32, [3]> var_4018 = const()[name = string("op_4018"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4024 = const()[name = string("op_4024"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_117_cast_fp16 = transpose(perm = var_4018, x = var_4014_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [1, 1, 2560]> var_4026_cast_fp16 = mul(x = x_117_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_4026_cast_fp16")];
+            bool input_175_interleave_0 = const()[name = string("input_175_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_175_cast_fp16 = concat(axis = var_4024, interleave = input_175_interleave_0, values = (x_117_cast_fp16, var_4026_cast_fp16))[name = string("input_175_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4021_to_fp16 = const()[name = string("op_4021_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_4021_to_fp16, x = input_175_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_4031_split_sizes_0 = const()[name = string("op_4031_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4031_axis_0 = const()[name = string("op_4031_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4031_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4031_cast_fp16_1 = split(axis = var_4031_axis_0, split_sizes = var_4031_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_4031_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551718976)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_59_cast_fp16 = mul(x = var_4031_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_69_promoted_to_fp16 = const()[name = string("const_69_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b2p-2])];
+            tensor<fp16, [1, 1, 2560]> x_119_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_69_promoted_to_fp16)[name = string("x_119_cast_fp16")];
+            tensor<int32, [1]> var_4043_axes_0 = const()[name = string("op_4043_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_4043_cast_fp16 = squeeze(axes = var_4043_axes_0, x = K_full_out_1_cast_fp16)[name = string("op_4043_cast_fp16")];
+            tensor<int32, [1]> var_4045_axes_0 = const()[name = string("op_4045_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_4045_cast_fp16 = squeeze(axes = var_4045_axes_0, x = V_full_out_1_cast_fp16)[name = string("op_4045_cast_fp16")];
+            tensor<int32, [4]> var_4048_begin_0 = const()[name = string("op_4048_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4048_end_0 = const()[name = string("op_4048_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4048_end_mask_0 = const()[name = string("op_4048_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4048_squeeze_mask_0 = const()[name = string("op_4048_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4048_cast_fp16 = slice_by_index(begin = var_4048_begin_0, end = var_4048_end_0, end_mask = var_4048_end_mask_0, squeeze_mask = var_4048_squeeze_mask_0, x = K_sliding_in)[name = string("op_4048_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_11_axes_0 = const()[name = string("K_sliding_slot_11_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_11_cast_fp16 = expand_dims(axes = K_sliding_slot_11_axes_0, x = var_4048_cast_fp16)[name = string("K_sliding_slot_11_cast_fp16")];
+            tensor<int32, [4]> var_4053_begin_0 = const()[name = string("op_4053_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4053_end_0 = const()[name = string("op_4053_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4053_end_mask_0 = const()[name = string("op_4053_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4053_squeeze_mask_0 = const()[name = string("op_4053_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4053_cast_fp16 = slice_by_index(begin = var_4053_begin_0, end = var_4053_end_0, end_mask = var_4053_end_mask_0, squeeze_mask = var_4053_squeeze_mask_0, x = V_sliding_in)[name = string("op_4053_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_11_axes_0 = const()[name = string("V_sliding_slot_11_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_11_cast_fp16 = expand_dims(axes = V_sliding_slot_11_axes_0, x = var_4053_cast_fp16)[name = string("V_sliding_slot_11_cast_fp16")];
+            int32 var_4060 = const()[name = string("op_4060"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4062_cast_fp16 = mul(x = x_119_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_4062_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_177_cast_fp16 = concat(axis = var_4060, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_4062_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4057_to_fp16 = const()[name = string("op_4057_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_4057_to_fp16, x = input_177_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_4067_split_sizes_0 = const()[name = string("op_4067_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4067_axis_0 = const()[name = string("op_4067_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4067_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4067_cast_fp16_1 = split(axis = var_4067_axis_0, split_sizes = var_4067_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_4067_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551724160)))];
+            tensor<fp16, [1, 1, 2560]> h_37_cast_fp16 = mul(x = var_4067_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_4073 = const()[name = string("op_4073"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4076_axes_0 = const()[name = string("op_4076_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4074_cast_fp16 = transpose(perm = var_4073, x = h_37_cast_fp16)[name = string("transpose_107")];
+            tensor<fp16, [1, 2560, 1, 1]> var_4076_cast_fp16 = expand_dims(axes = var_4076_axes_0, x = var_4074_cast_fp16)[name = string("op_4076_cast_fp16")];
+            string var_4092_pad_type_0 = const()[name = string("op_4092_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4092_strides_0 = const()[name = string("op_4092_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4092_pad_0 = const()[name = string("op_4092_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4092_dilations_0 = const()[name = string("op_4092_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4092_groups_0 = const()[name = string("op_4092_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_4092 = conv(dilations = var_4092_dilations_0, groups = var_4092_groups_0, pad = var_4092_pad_0, pad_type = var_4092_pad_type_0, strides = var_4092_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_4076_cast_fp16)[name = string("op_4092")];
+            tensor<int32, [4]> var_4097 = const()[name = string("op_4097"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_4098 = reshape(shape = var_4097, x = var_4092)[name = string("op_4098")];
+            tensor<int32, [4]> var_4103 = const()[name = string("op_4103"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4113 = const()[name = string("op_4113"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_4104 = transpose(perm = var_4103, x = var_4098)[name = string("transpose_106")];
+            tensor<fp16, [1, 8, 256]> x_121 = reshape(shape = var_4113, x = var_4104)[name = string("x_121")];
+            int32 var_4119 = const()[name = string("op_4119"), val = int32(-1)];
+            fp16 const_71_promoted = const()[name = string("const_71_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_4121 = mul(x = x_121, y = const_71_promoted)[name = string("op_4121")];
+            bool input_181_interleave_0 = const()[name = string("input_181_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_181 = concat(axis = var_4119, interleave = input_181_interleave_0, values = (x_121, var_4121))[name = string("input_181")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4116_to_fp16 = const()[name = string("op_4116_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_4116_to_fp16, x = input_181)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_4126_split_sizes_0 = const()[name = string("op_4126_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4126_axis_0 = const()[name = string("op_4126_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_4126_0, tensor<fp16, [1, 8, 256]> var_4126_1 = split(axis = var_4126_axis_0, split_sizes = var_4126_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_4126")];
+            tensor<fp16, [1, 8, 256]> var_4128 = mul(x = var_4126_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_4128")];
+            tensor<int32, [4]> var_4133 = const()[name = string("op_4133"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_51 = reshape(shape = var_4133, x = var_4128)[name = string("q_51")];
+            tensor<fp16, [1, 8, 1, 256]> var_4135_cast_fp16 = mul(x = q_51, y = cos_s)[name = string("op_4135_cast_fp16")];
+            tensor<int32, [2]> var_4136_split_sizes_0 = const()[name = string("op_4136_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4136_axis_0 = const()[name = string("op_4136_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_4136_0, tensor<fp16, [1, 8, 1, 128]> var_4136_1 = split(axis = var_4136_axis_0, split_sizes = var_4136_split_sizes_0, x = q_51)[name = string("op_4136")];
+            fp16 const_72_promoted = const()[name = string("const_72_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_4138 = mul(x = var_4136_1, y = const_72_promoted)[name = string("op_4138")];
+            int32 var_4140 = const()[name = string("op_4140"), val = int32(-1)];
+            bool var_4141_interleave_0 = const()[name = string("op_4141_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_4141 = concat(axis = var_4140, interleave = var_4141_interleave_0, values = (var_4138, var_4136_0))[name = string("op_4141")];
+            tensor<fp16, [1, 8, 1, 256]> var_4142_cast_fp16 = mul(x = var_4141, y = sin_s)[name = string("op_4142_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_55_cast_fp16 = add(x = var_4135_cast_fp16, y = var_4142_cast_fp16)[name = string("q_55_cast_fp16")];
+            string var_4155_pad_type_0 = const()[name = string("op_4155_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4155_strides_0 = const()[name = string("op_4155_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4155_pad_0 = const()[name = string("op_4155_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4155_dilations_0 = const()[name = string("op_4155_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4155_groups_0 = const()[name = string("op_4155_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4155 = conv(dilations = var_4155_dilations_0, groups = var_4155_groups_0, pad = var_4155_pad_0, pad_type = var_4155_pad_type_0, strides = var_4155_strides_0, weight = layers_6_self_attn_k_proj_weight_palettized, x = var_4076_cast_fp16)[name = string("op_4155")];
+            tensor<int32, [4]> var_4160 = const()[name = string("op_4160"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4161 = reshape(shape = var_4160, x = var_4155)[name = string("op_4161")];
+            tensor<int32, [4]> var_4166 = const()[name = string("op_4166"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_4183_pad_type_0 = const()[name = string("op_4183_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4183_strides_0 = const()[name = string("op_4183_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4183_pad_0 = const()[name = string("op_4183_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4183_dilations_0 = const()[name = string("op_4183_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4183_groups_0 = const()[name = string("op_4183_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4183 = conv(dilations = var_4183_dilations_0, groups = var_4183_groups_0, pad = var_4183_pad_0, pad_type = var_4183_pad_type_0, strides = var_4183_strides_0, weight = layers_6_self_attn_v_proj_weight_palettized, x = var_4076_cast_fp16)[name = string("op_4183")];
+            tensor<int32, [4]> var_4188 = const()[name = string("op_4188"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4189 = reshape(shape = var_4188, x = var_4183)[name = string("op_4189")];
+            tensor<int32, [4]> var_4194 = const()[name = string("op_4194"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4204 = const()[name = string("op_4204"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_4167 = transpose(perm = var_4166, x = var_4161)[name = string("transpose_105")];
+            tensor<fp16, [1, 2, 256]> x_123 = reshape(shape = var_4204, x = var_4167)[name = string("x_123")];
+            int32 var_4210 = const()[name = string("op_4210"), val = int32(-1)];
+            fp16 const_73_promoted = const()[name = string("const_73_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_4212 = mul(x = x_123, y = const_73_promoted)[name = string("op_4212")];
+            bool input_183_interleave_0 = const()[name = string("input_183_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_183 = concat(axis = var_4210, interleave = input_183_interleave_0, values = (x_123, var_4212))[name = string("input_183")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4207_to_fp16 = const()[name = string("op_4207_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_4207_to_fp16, x = input_183)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_4217_split_sizes_0 = const()[name = string("op_4217_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4217_axis_0 = const()[name = string("op_4217_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_4217_0, tensor<fp16, [1, 2, 256]> var_4217_1 = split(axis = var_4217_axis_0, split_sizes = var_4217_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_4217")];
+            tensor<fp16, [1, 2, 256]> var_4219 = mul(x = var_4217_0, y = layers_6_self_attn_k_norm_weight)[name = string("op_4219")];
+            tensor<int32, [4]> var_4224 = const()[name = string("op_4224"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_53 = reshape(shape = var_4224, x = var_4219)[name = string("q_53")];
+            fp16 var_4226_promoted = const()[name = string("op_4226_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_4195 = transpose(perm = var_4194, x = var_4189)[name = string("transpose_104")];
+            tensor<fp16, [1, 2, 1, 256]> var_4227 = pow(x = var_4195, y = var_4226_promoted)[name = string("op_4227")];
+            tensor<int32, [1]> var_4232_axes_0 = const()[name = string("op_4232_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4232_keep_dims_0 = const()[name = string("op_4232_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_4232 = reduce_mean(axes = var_4232_axes_0, keep_dims = var_4232_keep_dims_0, x = var_4227)[name = string("op_4232")];
+            fp16 var_4234_to_fp16 = const()[name = string("op_4234_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_13_cast_fp16 = add(x = var_4232, y = var_4234_to_fp16)[name = string("mean_sq_13_cast_fp16")];
+            fp32 var_4236_epsilon_0 = const()[name = string("op_4236_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_4236_cast_fp16 = rsqrt(epsilon = var_4236_epsilon_0, x = mean_sq_13_cast_fp16)[name = string("op_4236_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_187_cast_fp16 = mul(x = var_4195, y = var_4236_cast_fp16)[name = string("input_187_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_4238_cast_fp16 = mul(x = q_53, y = cos_s)[name = string("op_4238_cast_fp16")];
+            tensor<int32, [2]> var_4239_split_sizes_0 = const()[name = string("op_4239_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4239_axis_0 = const()[name = string("op_4239_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_4239_0, tensor<fp16, [1, 2, 1, 128]> var_4239_1 = split(axis = var_4239_axis_0, split_sizes = var_4239_split_sizes_0, x = q_53)[name = string("op_4239")];
+            fp16 const_74_promoted = const()[name = string("const_74_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_4241 = mul(x = var_4239_1, y = const_74_promoted)[name = string("op_4241")];
+            int32 var_4243 = const()[name = string("op_4243"), val = int32(-1)];
+            bool var_4244_interleave_0 = const()[name = string("op_4244_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_4244 = concat(axis = var_4243, interleave = var_4244_interleave_0, values = (var_4241, var_4239_0))[name = string("op_4244")];
+            tensor<fp16, [1, 2, 1, 256]> var_4245_cast_fp16 = mul(x = var_4244, y = sin_s)[name = string("op_4245_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_185_cast_fp16 = add(x = var_4238_cast_fp16, y = var_4245_cast_fp16)[name = string("input_185_cast_fp16")];
+            tensor<int32, [8]> k_padded_11_pad_0 = const()[name = string("k_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_11_mode_0 = const()[name = string("k_padded_11_mode_0"), val = string("constant")];
+            fp16 const_75_to_fp16 = const()[name = string("const_75_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_11_cast_fp16 = pad(constant_val = const_75_to_fp16, mode = k_padded_11_mode_0, pad = k_padded_11_pad_0, x = input_185_cast_fp16)[name = string("k_padded_11_cast_fp16")];
+            tensor<int32, [8]> v_padded_11_pad_0 = const()[name = string("v_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_11_mode_0 = const()[name = string("v_padded_11_mode_0"), val = string("constant")];
+            fp16 const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_11_cast_fp16 = pad(constant_val = const_76_to_fp16, mode = v_padded_11_mode_0, pad = v_padded_11_pad_0, x = input_187_cast_fp16)[name = string("v_padded_11_cast_fp16")];
+            tensor<int32, [4]> var_4274_begin_0 = const()[name = string("op_4274_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4274_end_0 = const()[name = string("op_4274_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4274_end_mask_0 = const()[name = string("op_4274_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4274_cast_fp16 = slice_by_index(begin = var_4274_begin_0, end = var_4274_end_0, end_mask = var_4274_end_mask_0, x = K_sliding_slot_11_cast_fp16)[name = string("op_4274_cast_fp16")];
+            int32 var_4281 = const()[name = string("op_4281"), val = int32(2)];
+            bool K_sliding_out_11_interleave_0 = const()[name = string("K_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_11_cast_fp16 = concat(axis = var_4281, interleave = K_sliding_out_11_interleave_0, values = (var_4274_cast_fp16, k_padded_11_cast_fp16))[name = string("K_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4297_begin_0 = const()[name = string("op_4297_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4297_end_0 = const()[name = string("op_4297_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4297_end_mask_0 = const()[name = string("op_4297_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4297_cast_fp16 = slice_by_index(begin = var_4297_begin_0, end = var_4297_end_0, end_mask = var_4297_end_mask_0, x = V_sliding_slot_11_cast_fp16)[name = string("op_4297_cast_fp16")];
+            int32 var_4304 = const()[name = string("op_4304"), val = int32(2)];
+            bool V_sliding_out_11_interleave_0 = const()[name = string("V_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_11_cast_fp16 = concat(axis = var_4304, interleave = V_sliding_out_11_interleave_0, values = (var_4297_cast_fp16, v_padded_11_cast_fp16))[name = string("V_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_13_begin_0 = const()[name = string("K_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_13_end_0 = const()[name = string("K_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_13_end_mask_0 = const()[name = string("K_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_13_cast_fp16 = slice_by_index(begin = K_for_attn_13_begin_0, end = K_for_attn_13_end_0, end_mask = K_for_attn_13_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("K_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_13_begin_0 = const()[name = string("V_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_13_end_0 = const()[name = string("V_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_13_end_mask_0 = const()[name = string("V_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_13_cast_fp16 = slice_by_index(begin = V_for_attn_13_begin_0, end = V_for_attn_13_end_0, end_mask = V_for_attn_13_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("V_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> transpose_24_perm_0 = const()[name = string("transpose_24_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_12_reps_0 = const()[name = string("tile_12_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_24_cast_fp16 = transpose(perm = transpose_24_perm_0, x = K_for_attn_13_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [8, 1, 512, 256]> tile_12_cast_fp16 = tile(reps = tile_12_reps_0, x = transpose_24_cast_fp16)[name = string("tile_12_cast_fp16")];
+            tensor<int32, [5]> concat_24 = const()[name = string("concat_24"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_24_cast_fp16 = reshape(shape = concat_24, x = tile_12_cast_fp16)[name = string("reshape_24_cast_fp16")];
+            tensor<int32, [5]> transpose_25_perm_0 = const()[name = string("transpose_25_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_25 = const()[name = string("concat_25"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_25_cast_fp16 = transpose(perm = transpose_25_perm_0, x = reshape_24_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_25_cast_fp16 = reshape(shape = concat_25, x = transpose_25_cast_fp16)[name = string("reshape_25_cast_fp16")];
+            tensor<int32, [4]> transpose_54_perm_0 = const()[name = string("transpose_54_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_26_perm_0 = const()[name = string("transpose_26_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_13_reps_0 = const()[name = string("tile_13_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_26_cast_fp16 = transpose(perm = transpose_26_perm_0, x = V_for_attn_13_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [8, 1, 512, 256]> tile_13_cast_fp16 = tile(reps = tile_13_reps_0, x = transpose_26_cast_fp16)[name = string("tile_13_cast_fp16")];
+            tensor<int32, [5]> concat_26 = const()[name = string("concat_26"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_26_cast_fp16 = reshape(shape = concat_26, x = tile_13_cast_fp16)[name = string("reshape_26_cast_fp16")];
+            tensor<int32, [5]> transpose_27_perm_0 = const()[name = string("transpose_27_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_27 = const()[name = string("concat_27"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_27_cast_fp16 = transpose(perm = transpose_27_perm_0, x = reshape_26_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_27_cast_fp16 = reshape(shape = concat_27, x = transpose_27_cast_fp16)[name = string("reshape_27_cast_fp16")];
+            tensor<int32, [4]> V_expanded_13_perm_0 = const()[name = string("V_expanded_13_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_54_cast_fp16 = transpose(perm = transpose_54_perm_0, x = reshape_25_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_55_cast_fp16, y = transpose_54_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_127_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_127_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_127_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 1, 512]> var_4345 = sub(x = x_127_cast_fp16, y = reduce_max_6)[name = string("op_4345")];
+            tensor<fp16, [1, 8, 1, 512]> var_4351 = exp(x = var_4345)[name = string("op_4351")];
+            tensor<int32, [1]> var_4361_axes_0 = const()[name = string("op_4361_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4361_keep_dims_0 = const()[name = string("op_4361_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_4361 = reduce_sum(axes = var_4361_axes_0, keep_dims = var_4361_keep_dims_0, x = var_4351)[name = string("op_4361")];
+            tensor<fp16, [1, 8, 1, 512]> var_4367_cast_fp16 = real_div(x = var_4351, y = var_4361)[name = string("op_4367_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_13_cast_fp16 = transpose(perm = V_expanded_13_perm_0, x = reshape_27_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_4367_cast_fp16, y = V_expanded_13_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_4378 = const()[name = string("op_4378"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4385 = const()[name = string("op_4385"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_4379_cast_fp16 = transpose(perm = var_4378, x = attn_output_37_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 1, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_4385, x = var_4379_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_4390 = const()[name = string("op_4390"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4406_pad_type_0 = const()[name = string("op_4406_pad_type_0"), val = string("valid")];
+            int32 var_4406_groups_0 = const()[name = string("op_4406_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4406_strides_0 = const()[name = string("op_4406_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4406_pad_0 = const()[name = string("op_4406_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4406_dilations_0 = const()[name = string("op_4406_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551729344))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554350848))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_4391_cast_fp16 = transpose(perm = var_4390, x = attn_output_39_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 2560, 1]> var_4406_cast_fp16 = conv(dilations = var_4406_dilations_0, groups = var_4406_groups_0, pad = var_4406_pad_0, pad_type = var_4406_pad_type_0, strides = var_4406_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_4391_cast_fp16)[name = string("op_4406_cast_fp16")];
+            tensor<int32, [3]> var_4410 = const()[name = string("op_4410"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4416 = const()[name = string("op_4416"), val = int32(-1)];
+            fp16 const_77_promoted_to_fp16 = const()[name = string("const_77_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_131_cast_fp16 = transpose(perm = var_4410, x = var_4406_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 1, 2560]> var_4418_cast_fp16 = mul(x = x_131_cast_fp16, y = const_77_promoted_to_fp16)[name = string("op_4418_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_191_cast_fp16 = concat(axis = var_4416, interleave = input_191_interleave_0, values = (x_131_cast_fp16, var_4418_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4413_to_fp16 = const()[name = string("op_4413_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_4413_to_fp16, x = input_191_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_4423_split_sizes_0 = const()[name = string("op_4423_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4423_axis_0 = const()[name = string("op_4423_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4423_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4423_cast_fp16_1 = split(axis = var_4423_axis_0, split_sizes = var_4423_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_4423_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554353472)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_41_cast_fp16 = mul(x = var_4423_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_133_cast_fp16 = add(x = x_119_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_133_cast_fp16")];
+            int32 var_4432 = const()[name = string("op_4432"), val = int32(-1)];
+            fp16 const_78_promoted_to_fp16 = const()[name = string("const_78_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4434_cast_fp16 = mul(x = x_133_cast_fp16, y = const_78_promoted_to_fp16)[name = string("op_4434_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_193_cast_fp16 = concat(axis = var_4432, interleave = input_193_interleave_0, values = (x_133_cast_fp16, var_4434_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4429_to_fp16 = const()[name = string("op_4429_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_4429_to_fp16, x = input_193_cast_fp16)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_4439_split_sizes_0 = const()[name = string("op_4439_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4439_axis_0 = const()[name = string("op_4439_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4439_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4439_cast_fp16_1 = split(axis = var_4439_axis_0, split_sizes = var_4439_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_4439_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554358656)))];
+            tensor<fp16, [1, 1, 2560]> h_39_cast_fp16 = mul(x = var_4439_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_4450 = const()[name = string("op_4450"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_195_axes_0 = const()[name = string("input_195_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4451 = transpose(perm = var_4450, x = h_39_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 2560, 1, 1]> input_195 = expand_dims(axes = input_195_axes_0, x = var_4451)[name = string("input_195")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_195)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_195)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 1]> input_197 = mul(x = gate_27, y = up_13)[name = string("input_197")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_197)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_4491_axes_0 = const()[name = string("op_4491_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4491 = squeeze(axes = var_4491_axes_0, x = mlp_out_13)[name = string("op_4491")];
+            tensor<int32, [3]> var_4495 = const()[name = string("op_4495"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4501 = const()[name = string("op_4501"), val = int32(-1)];
+            fp16 const_79_promoted = const()[name = string("const_79_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_135 = transpose(perm = var_4495, x = var_4491)[name = string("transpose_93")];
+            tensor<fp16, [1, 1, 2560]> var_4503 = mul(x = x_135, y = const_79_promoted)[name = string("op_4503")];
+            bool input_199_interleave_0 = const()[name = string("input_199_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_199 = concat(axis = var_4501, interleave = input_199_interleave_0, values = (x_135, var_4503))[name = string("input_199")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4498_to_fp16 = const()[name = string("op_4498_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_4498_to_fp16, x = input_199)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_4508_split_sizes_0 = const()[name = string("op_4508_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4508_axis_0 = const()[name = string("op_4508_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4508_0, tensor<fp16, [1, 1, 2560]> var_4508_1 = split(axis = var_4508_axis_0, split_sizes = var_4508_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_4508")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_63 = mul(x = var_4508_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_65_cast_fp16 = add(x = x_133_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 4608])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 1, 4864])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_4536 = const()[name = string("op_4536"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_201_axes_0 = const()[name = string("input_201_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4537 = transpose(perm = var_4536, x = hidden_states_65_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 2560, 1, 1]> input_201 = expand_dims(axes = input_201_axes_0, x = var_4537)[name = string("input_201")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_201)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_4556 = const()[name = string("op_4556"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_4557_cast_fp16 = transpose(perm = var_4556, x = per_layer_slice_13_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_4557_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_203_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_203_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554363840))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554691584))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_203_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_4573_axes_0 = const()[name = string("op_4573_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4573_cast_fp16 = squeeze(axes = var_4573_axes_0, x = gated_41_cast_fp16)[name = string("op_4573_cast_fp16")];
+            tensor<int32, [3]> var_4577 = const()[name = string("op_4577"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4583 = const()[name = string("op_4583"), val = int32(-1)];
+            fp16 const_80_promoted_to_fp16 = const()[name = string("const_80_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_137_cast_fp16 = transpose(perm = var_4577, x = var_4573_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 1, 2560]> var_4585_cast_fp16 = mul(x = x_137_cast_fp16, y = const_80_promoted_to_fp16)[name = string("op_4585_cast_fp16")];
+            bool input_205_interleave_0 = const()[name = string("input_205_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_205_cast_fp16 = concat(axis = var_4583, interleave = input_205_interleave_0, values = (x_137_cast_fp16, var_4585_cast_fp16))[name = string("input_205_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4580_to_fp16 = const()[name = string("op_4580_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_4580_to_fp16, x = input_205_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_4590_split_sizes_0 = const()[name = string("op_4590_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4590_axis_0 = const()[name = string("op_4590_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4590_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4590_cast_fp16_1 = split(axis = var_4590_axis_0, split_sizes = var_4590_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_4590_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554694208)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_69_cast_fp16 = mul(x = var_4590_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_81_promoted_to_fp16 = const()[name = string("const_81_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.16p-1])];
+            tensor<fp16, [1, 1, 2560]> x_139_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_81_promoted_to_fp16)[name = string("x_139_cast_fp16")];
+            tensor<int32, [1]> var_4602_axes_0 = const()[name = string("op_4602_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_4602_cast_fp16 = squeeze(axes = var_4602_axes_0, x = K_sliding_out_11_cast_fp16)[name = string("op_4602_cast_fp16")];
+            tensor<int32, [1]> var_4604_axes_0 = const()[name = string("op_4604_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_4604_cast_fp16 = squeeze(axes = var_4604_axes_0, x = V_sliding_out_11_cast_fp16)[name = string("op_4604_cast_fp16")];
+            tensor<int32, [4]> var_4607_begin_0 = const()[name = string("op_4607_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4607_end_0 = const()[name = string("op_4607_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_4607_end_mask_0 = const()[name = string("op_4607_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4607_squeeze_mask_0 = const()[name = string("op_4607_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4607_cast_fp16 = slice_by_index(begin = var_4607_begin_0, end = var_4607_end_0, end_mask = var_4607_end_mask_0, squeeze_mask = var_4607_squeeze_mask_0, x = K_sliding_in)[name = string("op_4607_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_13_axes_0 = const()[name = string("K_sliding_slot_13_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_13_cast_fp16 = expand_dims(axes = K_sliding_slot_13_axes_0, x = var_4607_cast_fp16)[name = string("K_sliding_slot_13_cast_fp16")];
+            tensor<int32, [4]> var_4612_begin_0 = const()[name = string("op_4612_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4612_end_0 = const()[name = string("op_4612_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_4612_end_mask_0 = const()[name = string("op_4612_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4612_squeeze_mask_0 = const()[name = string("op_4612_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4612_cast_fp16 = slice_by_index(begin = var_4612_begin_0, end = var_4612_end_0, end_mask = var_4612_end_mask_0, squeeze_mask = var_4612_squeeze_mask_0, x = V_sliding_in)[name = string("op_4612_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_13_axes_0 = const()[name = string("V_sliding_slot_13_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_13_cast_fp16 = expand_dims(axes = V_sliding_slot_13_axes_0, x = var_4612_cast_fp16)[name = string("V_sliding_slot_13_cast_fp16")];
+            int32 var_4619 = const()[name = string("op_4619"), val = int32(-1)];
+            fp16 const_82_promoted_to_fp16 = const()[name = string("const_82_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4621_cast_fp16 = mul(x = x_139_cast_fp16, y = const_82_promoted_to_fp16)[name = string("op_4621_cast_fp16")];
+            bool input_207_interleave_0 = const()[name = string("input_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_207_cast_fp16 = concat(axis = var_4619, interleave = input_207_interleave_0, values = (x_139_cast_fp16, var_4621_cast_fp16))[name = string("input_207_cast_fp16")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4616_to_fp16 = const()[name = string("op_4616_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_4616_to_fp16, x = input_207_cast_fp16)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_4626_split_sizes_0 = const()[name = string("op_4626_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4626_axis_0 = const()[name = string("op_4626_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4626_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4626_cast_fp16_1 = split(axis = var_4626_axis_0, split_sizes = var_4626_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_4626_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554699392)))];
+            tensor<fp16, [1, 1, 2560]> h_43_cast_fp16 = mul(x = var_4626_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_4632 = const()[name = string("op_4632"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4635_axes_0 = const()[name = string("op_4635_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4633_cast_fp16 = transpose(perm = var_4632, x = h_43_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 2560, 1, 1]> var_4635_cast_fp16 = expand_dims(axes = var_4635_axes_0, x = var_4633_cast_fp16)[name = string("op_4635_cast_fp16")];
+            string var_4651_pad_type_0 = const()[name = string("op_4651_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4651_strides_0 = const()[name = string("op_4651_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4651_pad_0 = const()[name = string("op_4651_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4651_dilations_0 = const()[name = string("op_4651_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4651_groups_0 = const()[name = string("op_4651_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_4651 = conv(dilations = var_4651_dilations_0, groups = var_4651_groups_0, pad = var_4651_pad_0, pad_type = var_4651_pad_type_0, strides = var_4651_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_4635_cast_fp16)[name = string("op_4651")];
+            tensor<int32, [4]> var_4656 = const()[name = string("op_4656"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_4657 = reshape(shape = var_4656, x = var_4651)[name = string("op_4657")];
+            tensor<int32, [4]> var_4662 = const()[name = string("op_4662"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4672 = const()[name = string("op_4672"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_4663 = transpose(perm = var_4662, x = var_4657)[name = string("transpose_88")];
+            tensor<fp16, [1, 8, 256]> x_141 = reshape(shape = var_4672, x = var_4663)[name = string("x_141")];
+            int32 var_4678 = const()[name = string("op_4678"), val = int32(-1)];
+            fp16 const_83_promoted = const()[name = string("const_83_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_4680 = mul(x = x_141, y = const_83_promoted)[name = string("op_4680")];
+            bool input_211_interleave_0 = const()[name = string("input_211_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_211 = concat(axis = var_4678, interleave = input_211_interleave_0, values = (x_141, var_4680))[name = string("input_211")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4675_to_fp16 = const()[name = string("op_4675_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_4675_to_fp16, x = input_211)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_4685_split_sizes_0 = const()[name = string("op_4685_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4685_axis_0 = const()[name = string("op_4685_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_4685_0, tensor<fp16, [1, 8, 256]> var_4685_1 = split(axis = var_4685_axis_0, split_sizes = var_4685_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_4685")];
+            tensor<fp16, [1, 8, 256]> var_4687 = mul(x = var_4685_0, y = layers_7_self_attn_q_norm_weight)[name = string("op_4687")];
+            tensor<int32, [4]> var_4692 = const()[name = string("op_4692"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_59 = reshape(shape = var_4692, x = var_4687)[name = string("q_59")];
+            tensor<fp16, [1, 8, 1, 256]> var_4694_cast_fp16 = mul(x = q_59, y = cos_s)[name = string("op_4694_cast_fp16")];
+            tensor<int32, [2]> var_4695_split_sizes_0 = const()[name = string("op_4695_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4695_axis_0 = const()[name = string("op_4695_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_4695_0, tensor<fp16, [1, 8, 1, 128]> var_4695_1 = split(axis = var_4695_axis_0, split_sizes = var_4695_split_sizes_0, x = q_59)[name = string("op_4695")];
+            fp16 const_84_promoted = const()[name = string("const_84_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_4697 = mul(x = var_4695_1, y = const_84_promoted)[name = string("op_4697")];
+            int32 var_4699 = const()[name = string("op_4699"), val = int32(-1)];
+            bool var_4700_interleave_0 = const()[name = string("op_4700_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_4700 = concat(axis = var_4699, interleave = var_4700_interleave_0, values = (var_4697, var_4695_0))[name = string("op_4700")];
+            tensor<fp16, [1, 8, 1, 256]> var_4701_cast_fp16 = mul(x = var_4700, y = sin_s)[name = string("op_4701_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_63_cast_fp16 = add(x = var_4694_cast_fp16, y = var_4701_cast_fp16)[name = string("q_63_cast_fp16")];
+            string var_4714_pad_type_0 = const()[name = string("op_4714_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4714_strides_0 = const()[name = string("op_4714_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4714_pad_0 = const()[name = string("op_4714_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4714_dilations_0 = const()[name = string("op_4714_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4714_groups_0 = const()[name = string("op_4714_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4714 = conv(dilations = var_4714_dilations_0, groups = var_4714_groups_0, pad = var_4714_pad_0, pad_type = var_4714_pad_type_0, strides = var_4714_strides_0, weight = layers_7_self_attn_k_proj_weight_palettized, x = var_4635_cast_fp16)[name = string("op_4714")];
+            tensor<int32, [4]> var_4719 = const()[name = string("op_4719"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4720 = reshape(shape = var_4719, x = var_4714)[name = string("op_4720")];
+            tensor<int32, [4]> var_4725 = const()[name = string("op_4725"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_4742_pad_type_0 = const()[name = string("op_4742_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4742_strides_0 = const()[name = string("op_4742_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4742_pad_0 = const()[name = string("op_4742_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4742_dilations_0 = const()[name = string("op_4742_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4742_groups_0 = const()[name = string("op_4742_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4742 = conv(dilations = var_4742_dilations_0, groups = var_4742_groups_0, pad = var_4742_pad_0, pad_type = var_4742_pad_type_0, strides = var_4742_strides_0, weight = layers_7_self_attn_v_proj_weight_palettized, x = var_4635_cast_fp16)[name = string("op_4742")];
+            tensor<int32, [4]> var_4747 = const()[name = string("op_4747"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4748 = reshape(shape = var_4747, x = var_4742)[name = string("op_4748")];
+            tensor<int32, [4]> var_4753 = const()[name = string("op_4753"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4763 = const()[name = string("op_4763"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_4726 = transpose(perm = var_4725, x = var_4720)[name = string("transpose_87")];
+            tensor<fp16, [1, 2, 256]> x_143 = reshape(shape = var_4763, x = var_4726)[name = string("x_143")];
+            int32 var_4769 = const()[name = string("op_4769"), val = int32(-1)];
+            fp16 const_85_promoted = const()[name = string("const_85_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_4771 = mul(x = x_143, y = const_85_promoted)[name = string("op_4771")];
+            bool input_213_interleave_0 = const()[name = string("input_213_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_213 = concat(axis = var_4769, interleave = input_213_interleave_0, values = (x_143, var_4771))[name = string("input_213")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4766_to_fp16 = const()[name = string("op_4766_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_4766_to_fp16, x = input_213)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_4776_split_sizes_0 = const()[name = string("op_4776_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4776_axis_0 = const()[name = string("op_4776_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_4776_0, tensor<fp16, [1, 2, 256]> var_4776_1 = split(axis = var_4776_axis_0, split_sizes = var_4776_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_4776")];
+            tensor<fp16, [1, 2, 256]> var_4778 = mul(x = var_4776_0, y = layers_7_self_attn_k_norm_weight)[name = string("op_4778")];
+            tensor<int32, [4]> var_4783 = const()[name = string("op_4783"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_61 = reshape(shape = var_4783, x = var_4778)[name = string("q_61")];
+            fp16 var_4785_promoted = const()[name = string("op_4785_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_4754 = transpose(perm = var_4753, x = var_4748)[name = string("transpose_86")];
+            tensor<fp16, [1, 2, 1, 256]> var_4786 = pow(x = var_4754, y = var_4785_promoted)[name = string("op_4786")];
+            tensor<int32, [1]> var_4791_axes_0 = const()[name = string("op_4791_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4791_keep_dims_0 = const()[name = string("op_4791_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_4791 = reduce_mean(axes = var_4791_axes_0, keep_dims = var_4791_keep_dims_0, x = var_4786)[name = string("op_4791")];
+            fp16 var_4793_to_fp16 = const()[name = string("op_4793_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_15_cast_fp16 = add(x = var_4791, y = var_4793_to_fp16)[name = string("mean_sq_15_cast_fp16")];
+            fp32 var_4795_epsilon_0 = const()[name = string("op_4795_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_4795_cast_fp16 = rsqrt(epsilon = var_4795_epsilon_0, x = mean_sq_15_cast_fp16)[name = string("op_4795_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_217_cast_fp16 = mul(x = var_4754, y = var_4795_cast_fp16)[name = string("input_217_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_4797_cast_fp16 = mul(x = q_61, y = cos_s)[name = string("op_4797_cast_fp16")];
+            tensor<int32, [2]> var_4798_split_sizes_0 = const()[name = string("op_4798_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4798_axis_0 = const()[name = string("op_4798_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_4798_0, tensor<fp16, [1, 2, 1, 128]> var_4798_1 = split(axis = var_4798_axis_0, split_sizes = var_4798_split_sizes_0, x = q_61)[name = string("op_4798")];
+            fp16 const_86_promoted = const()[name = string("const_86_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_4800 = mul(x = var_4798_1, y = const_86_promoted)[name = string("op_4800")];
+            int32 var_4802 = const()[name = string("op_4802"), val = int32(-1)];
+            bool var_4803_interleave_0 = const()[name = string("op_4803_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_4803 = concat(axis = var_4802, interleave = var_4803_interleave_0, values = (var_4800, var_4798_0))[name = string("op_4803")];
+            tensor<fp16, [1, 2, 1, 256]> var_4804_cast_fp16 = mul(x = var_4803, y = sin_s)[name = string("op_4804_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_215_cast_fp16 = add(x = var_4797_cast_fp16, y = var_4804_cast_fp16)[name = string("input_215_cast_fp16")];
+            tensor<int32, [8]> k_padded_13_pad_0 = const()[name = string("k_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_13_mode_0 = const()[name = string("k_padded_13_mode_0"), val = string("constant")];
+            fp16 const_87_to_fp16 = const()[name = string("const_87_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_13_cast_fp16 = pad(constant_val = const_87_to_fp16, mode = k_padded_13_mode_0, pad = k_padded_13_pad_0, x = input_215_cast_fp16)[name = string("k_padded_13_cast_fp16")];
+            tensor<int32, [8]> v_padded_13_pad_0 = const()[name = string("v_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_13_mode_0 = const()[name = string("v_padded_13_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_13_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = v_padded_13_mode_0, pad = v_padded_13_pad_0, x = input_217_cast_fp16)[name = string("v_padded_13_cast_fp16")];
+            tensor<int32, [4]> var_4833_begin_0 = const()[name = string("op_4833_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4833_end_0 = const()[name = string("op_4833_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4833_end_mask_0 = const()[name = string("op_4833_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4833_cast_fp16 = slice_by_index(begin = var_4833_begin_0, end = var_4833_end_0, end_mask = var_4833_end_mask_0, x = K_sliding_slot_13_cast_fp16)[name = string("op_4833_cast_fp16")];
+            int32 var_4840 = const()[name = string("op_4840"), val = int32(2)];
+            bool K_sliding_out_13_interleave_0 = const()[name = string("K_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_13_cast_fp16 = concat(axis = var_4840, interleave = K_sliding_out_13_interleave_0, values = (var_4833_cast_fp16, k_padded_13_cast_fp16))[name = string("K_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_4856_begin_0 = const()[name = string("op_4856_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4856_end_0 = const()[name = string("op_4856_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4856_end_mask_0 = const()[name = string("op_4856_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4856_cast_fp16 = slice_by_index(begin = var_4856_begin_0, end = var_4856_end_0, end_mask = var_4856_end_mask_0, x = V_sliding_slot_13_cast_fp16)[name = string("op_4856_cast_fp16")];
+            int32 var_4863 = const()[name = string("op_4863"), val = int32(2)];
+            bool V_sliding_out_13_interleave_0 = const()[name = string("V_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_13_cast_fp16 = concat(axis = var_4863, interleave = V_sliding_out_13_interleave_0, values = (var_4856_cast_fp16, v_padded_13_cast_fp16))[name = string("V_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_15_begin_0 = const()[name = string("K_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_15_end_0 = const()[name = string("K_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_15_end_mask_0 = const()[name = string("K_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_15_cast_fp16 = slice_by_index(begin = K_for_attn_15_begin_0, end = K_for_attn_15_end_0, end_mask = K_for_attn_15_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("K_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_15_begin_0 = const()[name = string("V_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_15_end_0 = const()[name = string("V_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_15_end_mask_0 = const()[name = string("V_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_15_cast_fp16 = slice_by_index(begin = V_for_attn_15_begin_0, end = V_for_attn_15_end_0, end_mask = V_for_attn_15_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("V_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> transpose_28_perm_0 = const()[name = string("transpose_28_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_14_reps_0 = const()[name = string("tile_14_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_28_cast_fp16 = transpose(perm = transpose_28_perm_0, x = K_for_attn_15_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [8, 1, 512, 256]> tile_14_cast_fp16 = tile(reps = tile_14_reps_0, x = transpose_28_cast_fp16)[name = string("tile_14_cast_fp16")];
+            tensor<int32, [5]> concat_28 = const()[name = string("concat_28"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_28_cast_fp16 = reshape(shape = concat_28, x = tile_14_cast_fp16)[name = string("reshape_28_cast_fp16")];
+            tensor<int32, [5]> transpose_29_perm_0 = const()[name = string("transpose_29_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_29 = const()[name = string("concat_29"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_29_cast_fp16 = transpose(perm = transpose_29_perm_0, x = reshape_28_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_29_cast_fp16 = reshape(shape = concat_29, x = transpose_29_cast_fp16)[name = string("reshape_29_cast_fp16")];
+            tensor<int32, [4]> transpose_55_perm_0 = const()[name = string("transpose_55_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_30_perm_0 = const()[name = string("transpose_30_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_15_reps_0 = const()[name = string("tile_15_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_30_cast_fp16 = transpose(perm = transpose_30_perm_0, x = V_for_attn_15_cast_fp16)[name = string("transpose_83")];
+            tensor<fp16, [8, 1, 512, 256]> tile_15_cast_fp16 = tile(reps = tile_15_reps_0, x = transpose_30_cast_fp16)[name = string("tile_15_cast_fp16")];
+            tensor<int32, [5]> concat_30 = const()[name = string("concat_30"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_30_cast_fp16 = reshape(shape = concat_30, x = tile_15_cast_fp16)[name = string("reshape_30_cast_fp16")];
+            tensor<int32, [5]> transpose_31_perm_0 = const()[name = string("transpose_31_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_31 = const()[name = string("concat_31"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_31_cast_fp16 = transpose(perm = transpose_31_perm_0, x = reshape_30_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_31_cast_fp16 = reshape(shape = concat_31, x = transpose_31_cast_fp16)[name = string("reshape_31_cast_fp16")];
+            tensor<int32, [4]> V_expanded_15_perm_0 = const()[name = string("V_expanded_15_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_55_cast_fp16 = transpose(perm = transpose_55_perm_0, x = reshape_29_cast_fp16)[name = string("transpose_81")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_63_cast_fp16, y = transpose_55_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_147_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_147_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_147_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 1, 512]> var_4904 = sub(x = x_147_cast_fp16, y = reduce_max_7)[name = string("op_4904")];
+            tensor<fp16, [1, 8, 1, 512]> var_4910 = exp(x = var_4904)[name = string("op_4910")];
+            tensor<int32, [1]> var_4920_axes_0 = const()[name = string("op_4920_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4920_keep_dims_0 = const()[name = string("op_4920_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_4920 = reduce_sum(axes = var_4920_axes_0, keep_dims = var_4920_keep_dims_0, x = var_4910)[name = string("op_4920")];
+            tensor<fp16, [1, 8, 1, 512]> var_4926_cast_fp16 = real_div(x = var_4910, y = var_4920)[name = string("op_4926_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_15_cast_fp16 = transpose(perm = V_expanded_15_perm_0, x = reshape_31_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_4926_cast_fp16, y = V_expanded_15_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_4937 = const()[name = string("op_4937"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4944 = const()[name = string("op_4944"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_4938_cast_fp16 = transpose(perm = var_4937, x = attn_output_43_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 1, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_4944, x = var_4938_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_4949 = const()[name = string("op_4949"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4965_pad_type_0 = const()[name = string("op_4965_pad_type_0"), val = string("valid")];
+            int32 var_4965_groups_0 = const()[name = string("op_4965_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4965_strides_0 = const()[name = string("op_4965_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4965_pad_0 = const()[name = string("op_4965_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4965_dilations_0 = const()[name = string("op_4965_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554704576))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557326080))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_4950_cast_fp16 = transpose(perm = var_4949, x = attn_output_45_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [1, 2560, 1]> var_4965_cast_fp16 = conv(dilations = var_4965_dilations_0, groups = var_4965_groups_0, pad = var_4965_pad_0, pad_type = var_4965_pad_type_0, strides = var_4965_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_4950_cast_fp16)[name = string("op_4965_cast_fp16")];
+            tensor<int32, [3]> var_4969 = const()[name = string("op_4969"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4975 = const()[name = string("op_4975"), val = int32(-1)];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_151_cast_fp16 = transpose(perm = var_4969, x = var_4965_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 1, 2560]> var_4977_cast_fp16 = mul(x = x_151_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_4977_cast_fp16")];
+            bool input_221_interleave_0 = const()[name = string("input_221_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_221_cast_fp16 = concat(axis = var_4975, interleave = input_221_interleave_0, values = (x_151_cast_fp16, var_4977_cast_fp16))[name = string("input_221_cast_fp16")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4972_to_fp16 = const()[name = string("op_4972_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_4972_to_fp16, x = input_221_cast_fp16)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_4982_split_sizes_0 = const()[name = string("op_4982_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4982_axis_0 = const()[name = string("op_4982_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4982_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4982_cast_fp16_1 = split(axis = var_4982_axis_0, split_sizes = var_4982_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_4982_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557328704)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_47_cast_fp16 = mul(x = var_4982_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_153_cast_fp16 = add(x = x_139_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_153_cast_fp16")];
+            int32 var_4991 = const()[name = string("op_4991"), val = int32(-1)];
+            fp16 const_90_promoted_to_fp16 = const()[name = string("const_90_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4993_cast_fp16 = mul(x = x_153_cast_fp16, y = const_90_promoted_to_fp16)[name = string("op_4993_cast_fp16")];
+            bool input_223_interleave_0 = const()[name = string("input_223_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_223_cast_fp16 = concat(axis = var_4991, interleave = input_223_interleave_0, values = (x_153_cast_fp16, var_4993_cast_fp16))[name = string("input_223_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4988_to_fp16 = const()[name = string("op_4988_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_4988_to_fp16, x = input_223_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_4998_split_sizes_0 = const()[name = string("op_4998_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4998_axis_0 = const()[name = string("op_4998_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4998_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4998_cast_fp16_1 = split(axis = var_4998_axis_0, split_sizes = var_4998_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_4998_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557333888)))];
+            tensor<fp16, [1, 1, 2560]> h_45_cast_fp16 = mul(x = var_4998_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_5009 = const()[name = string("op_5009"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_225_axes_0 = const()[name = string("input_225_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5010 = transpose(perm = var_5009, x = h_45_cast_fp16)[name = string("transpose_76")];
+            tensor<fp16, [1, 2560, 1, 1]> input_225 = expand_dims(axes = input_225_axes_0, x = var_5010)[name = string("input_225")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_225)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_225)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 1]> input_227 = mul(x = gate_31, y = up_15)[name = string("input_227")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_227)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_5050_axes_0 = const()[name = string("op_5050_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5050 = squeeze(axes = var_5050_axes_0, x = mlp_out_15)[name = string("op_5050")];
+            tensor<int32, [3]> var_5054 = const()[name = string("op_5054"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5060 = const()[name = string("op_5060"), val = int32(-1)];
+            fp16 const_91_promoted = const()[name = string("const_91_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_155 = transpose(perm = var_5054, x = var_5050)[name = string("transpose_75")];
+            tensor<fp16, [1, 1, 2560]> var_5062 = mul(x = x_155, y = const_91_promoted)[name = string("op_5062")];
+            bool input_229_interleave_0 = const()[name = string("input_229_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_229 = concat(axis = var_5060, interleave = input_229_interleave_0, values = (x_155, var_5062))[name = string("input_229")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5057_to_fp16 = const()[name = string("op_5057_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_5057_to_fp16, x = input_229)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_5067_split_sizes_0 = const()[name = string("op_5067_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5067_axis_0 = const()[name = string("op_5067_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5067_0, tensor<fp16, [1, 1, 2560]> var_5067_1 = split(axis = var_5067_axis_0, split_sizes = var_5067_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_5067")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_73 = mul(x = var_5067_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_75_cast_fp16 = add(x = x_153_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 4864])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 1, 5120])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_5095 = const()[name = string("op_5095"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_231_axes_0 = const()[name = string("input_231_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5096 = transpose(perm = var_5095, x = hidden_states_75_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [1, 2560, 1, 1]> input_231 = expand_dims(axes = input_231_axes_0, x = var_5096)[name = string("input_231")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_231)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_5115 = const()[name = string("op_5115"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_5116_cast_fp16 = transpose(perm = var_5115, x = per_layer_slice_15_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_5116_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_233_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_233_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557339072))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557666816))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_233_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_5132_axes_0 = const()[name = string("op_5132_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5132_cast_fp16 = squeeze(axes = var_5132_axes_0, x = gated_47_cast_fp16)[name = string("op_5132_cast_fp16")];
+            tensor<int32, [3]> var_5136 = const()[name = string("op_5136"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5142 = const()[name = string("op_5142"), val = int32(-1)];
+            fp16 const_92_promoted_to_fp16 = const()[name = string("const_92_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_157_cast_fp16 = transpose(perm = var_5136, x = var_5132_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 1, 2560]> var_5144_cast_fp16 = mul(x = x_157_cast_fp16, y = const_92_promoted_to_fp16)[name = string("op_5144_cast_fp16")];
+            bool input_235_interleave_0 = const()[name = string("input_235_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_235_cast_fp16 = concat(axis = var_5142, interleave = input_235_interleave_0, values = (x_157_cast_fp16, var_5144_cast_fp16))[name = string("input_235_cast_fp16")];
+            tensor<int32, [1]> normed_221_axes_0 = const()[name = string("normed_221_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5139_to_fp16 = const()[name = string("op_5139_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_221_cast_fp16 = layer_norm(axes = normed_221_axes_0, epsilon = var_5139_to_fp16, x = input_235_cast_fp16)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [2]> var_5149_split_sizes_0 = const()[name = string("op_5149_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5149_axis_0 = const()[name = string("op_5149_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5149_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5149_cast_fp16_1 = split(axis = var_5149_axis_0, split_sizes = var_5149_split_sizes_0, x = normed_221_cast_fp16)[name = string("op_5149_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557669440)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_79_cast_fp16 = mul(x = var_5149_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_93_promoted_to_fp16 = const()[name = string("const_93_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.06p-1])];
+            tensor<fp16, [1, 1, 2560]> x_159_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_93_promoted_to_fp16)[name = string("x_159_cast_fp16")];
+            tensor<int32, [1]> var_5161_axes_0 = const()[name = string("op_5161_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5161_cast_fp16 = squeeze(axes = var_5161_axes_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5161_cast_fp16")];
+            tensor<int32, [1]> var_5163_axes_0 = const()[name = string("op_5163_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5163_cast_fp16 = squeeze(axes = var_5163_axes_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5163_cast_fp16")];
+            tensor<int32, [4]> var_5166_begin_0 = const()[name = string("op_5166_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5166_end_0 = const()[name = string("op_5166_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5166_end_mask_0 = const()[name = string("op_5166_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5166_squeeze_mask_0 = const()[name = string("op_5166_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5166_cast_fp16 = slice_by_index(begin = var_5166_begin_0, end = var_5166_end_0, end_mask = var_5166_end_mask_0, squeeze_mask = var_5166_squeeze_mask_0, x = K_sliding_in)[name = string("op_5166_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_15_axes_0 = const()[name = string("K_sliding_slot_15_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_15_cast_fp16 = expand_dims(axes = K_sliding_slot_15_axes_0, x = var_5166_cast_fp16)[name = string("K_sliding_slot_15_cast_fp16")];
+            tensor<int32, [4]> var_5171_begin_0 = const()[name = string("op_5171_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5171_end_0 = const()[name = string("op_5171_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5171_end_mask_0 = const()[name = string("op_5171_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5171_squeeze_mask_0 = const()[name = string("op_5171_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5171_cast_fp16 = slice_by_index(begin = var_5171_begin_0, end = var_5171_end_0, end_mask = var_5171_end_mask_0, squeeze_mask = var_5171_squeeze_mask_0, x = V_sliding_in)[name = string("op_5171_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_15_axes_0 = const()[name = string("V_sliding_slot_15_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_15_cast_fp16 = expand_dims(axes = V_sliding_slot_15_axes_0, x = var_5171_cast_fp16)[name = string("V_sliding_slot_15_cast_fp16")];
+            int32 var_5178 = const()[name = string("op_5178"), val = int32(-1)];
+            fp16 const_94_promoted_to_fp16 = const()[name = string("const_94_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5180_cast_fp16 = mul(x = x_159_cast_fp16, y = const_94_promoted_to_fp16)[name = string("op_5180_cast_fp16")];
+            bool input_237_interleave_0 = const()[name = string("input_237_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_237_cast_fp16 = concat(axis = var_5178, interleave = input_237_interleave_0, values = (x_159_cast_fp16, var_5180_cast_fp16))[name = string("input_237_cast_fp16")];
+            tensor<int32, [1]> normed_225_axes_0 = const()[name = string("normed_225_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5175_to_fp16 = const()[name = string("op_5175_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_225_cast_fp16 = layer_norm(axes = normed_225_axes_0, epsilon = var_5175_to_fp16, x = input_237_cast_fp16)[name = string("normed_225_cast_fp16")];
+            tensor<int32, [2]> var_5185_split_sizes_0 = const()[name = string("op_5185_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5185_axis_0 = const()[name = string("op_5185_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5185_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5185_cast_fp16_1 = split(axis = var_5185_axis_0, split_sizes = var_5185_split_sizes_0, x = normed_225_cast_fp16)[name = string("op_5185_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557674624)))];
+            tensor<fp16, [1, 1, 2560]> h_49_cast_fp16 = mul(x = var_5185_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_5191 = const()[name = string("op_5191"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5194_axes_0 = const()[name = string("op_5194_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5192_cast_fp16 = transpose(perm = var_5191, x = h_49_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 2560, 1, 1]> var_5194_cast_fp16 = expand_dims(axes = var_5194_axes_0, x = var_5192_cast_fp16)[name = string("op_5194_cast_fp16")];
+            string var_5210_pad_type_0 = const()[name = string("op_5210_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5210_strides_0 = const()[name = string("op_5210_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5210_pad_0 = const()[name = string("op_5210_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5210_dilations_0 = const()[name = string("op_5210_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5210_groups_0 = const()[name = string("op_5210_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_5210 = conv(dilations = var_5210_dilations_0, groups = var_5210_groups_0, pad = var_5210_pad_0, pad_type = var_5210_pad_type_0, strides = var_5210_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_5194_cast_fp16)[name = string("op_5210")];
+            tensor<int32, [4]> var_5215 = const()[name = string("op_5215"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_5216 = reshape(shape = var_5215, x = var_5210)[name = string("op_5216")];
+            tensor<int32, [4]> var_5221 = const()[name = string("op_5221"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5231 = const()[name = string("op_5231"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_5222 = transpose(perm = var_5221, x = var_5216)[name = string("transpose_70")];
+            tensor<fp16, [1, 8, 256]> x_161 = reshape(shape = var_5231, x = var_5222)[name = string("x_161")];
+            int32 var_5237 = const()[name = string("op_5237"), val = int32(-1)];
+            fp16 const_95_promoted = const()[name = string("const_95_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_5239 = mul(x = x_161, y = const_95_promoted)[name = string("op_5239")];
+            bool input_241_interleave_0 = const()[name = string("input_241_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_241 = concat(axis = var_5237, interleave = input_241_interleave_0, values = (x_161, var_5239))[name = string("input_241")];
+            tensor<int32, [1]> normed_229_axes_0 = const()[name = string("normed_229_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5234_to_fp16 = const()[name = string("op_5234_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_229_cast_fp16 = layer_norm(axes = normed_229_axes_0, epsilon = var_5234_to_fp16, x = input_241)[name = string("normed_229_cast_fp16")];
+            tensor<int32, [2]> var_5244_split_sizes_0 = const()[name = string("op_5244_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5244_axis_0 = const()[name = string("op_5244_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_5244_0, tensor<fp16, [1, 8, 256]> var_5244_1 = split(axis = var_5244_axis_0, split_sizes = var_5244_split_sizes_0, x = normed_229_cast_fp16)[name = string("op_5244")];
+            tensor<int32, [4]> var_5251 = const()[name = string("op_5251"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_67 = reshape(shape = var_5251, x = var_5244_0)[name = string("q_67")];
+            tensor<fp16, [1, 8, 1, 256]> var_5253_cast_fp16 = mul(x = q_67, y = cos_s)[name = string("op_5253_cast_fp16")];
+            tensor<int32, [2]> var_5254_split_sizes_0 = const()[name = string("op_5254_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5254_axis_0 = const()[name = string("op_5254_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_5254_0, tensor<fp16, [1, 8, 1, 128]> var_5254_1 = split(axis = var_5254_axis_0, split_sizes = var_5254_split_sizes_0, x = q_67)[name = string("op_5254")];
+            fp16 const_96_promoted = const()[name = string("const_96_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_5256 = mul(x = var_5254_1, y = const_96_promoted)[name = string("op_5256")];
+            int32 var_5258 = const()[name = string("op_5258"), val = int32(-1)];
+            bool var_5259_interleave_0 = const()[name = string("op_5259_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_5259 = concat(axis = var_5258, interleave = var_5259_interleave_0, values = (var_5256, var_5254_0))[name = string("op_5259")];
+            tensor<fp16, [1, 8, 1, 256]> var_5260_cast_fp16 = mul(x = var_5259, y = sin_s)[name = string("op_5260_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_71_cast_fp16 = add(x = var_5253_cast_fp16, y = var_5260_cast_fp16)[name = string("q_71_cast_fp16")];
+            string var_5273_pad_type_0 = const()[name = string("op_5273_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5273_strides_0 = const()[name = string("op_5273_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5273_pad_0 = const()[name = string("op_5273_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5273_dilations_0 = const()[name = string("op_5273_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5273_groups_0 = const()[name = string("op_5273_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5273 = conv(dilations = var_5273_dilations_0, groups = var_5273_groups_0, pad = var_5273_pad_0, pad_type = var_5273_pad_type_0, strides = var_5273_strides_0, weight = layers_8_self_attn_k_proj_weight_palettized, x = var_5194_cast_fp16)[name = string("op_5273")];
+            tensor<int32, [4]> var_5278 = const()[name = string("op_5278"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5279 = reshape(shape = var_5278, x = var_5273)[name = string("op_5279")];
+            tensor<int32, [4]> var_5284 = const()[name = string("op_5284"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_5301_pad_type_0 = const()[name = string("op_5301_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5301_strides_0 = const()[name = string("op_5301_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5301_pad_0 = const()[name = string("op_5301_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5301_dilations_0 = const()[name = string("op_5301_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5301_groups_0 = const()[name = string("op_5301_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5301 = conv(dilations = var_5301_dilations_0, groups = var_5301_groups_0, pad = var_5301_pad_0, pad_type = var_5301_pad_type_0, strides = var_5301_strides_0, weight = layers_8_self_attn_v_proj_weight_palettized, x = var_5194_cast_fp16)[name = string("op_5301")];
+            tensor<int32, [4]> var_5306 = const()[name = string("op_5306"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5307 = reshape(shape = var_5306, x = var_5301)[name = string("op_5307")];
+            tensor<int32, [4]> var_5312 = const()[name = string("op_5312"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5322 = const()[name = string("op_5322"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_5285 = transpose(perm = var_5284, x = var_5279)[name = string("transpose_69")];
+            tensor<fp16, [1, 2, 256]> x_163 = reshape(shape = var_5322, x = var_5285)[name = string("x_163")];
+            int32 var_5328 = const()[name = string("op_5328"), val = int32(-1)];
+            fp16 const_97_promoted = const()[name = string("const_97_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_5330 = mul(x = x_163, y = const_97_promoted)[name = string("op_5330")];
+            bool input_243_interleave_0 = const()[name = string("input_243_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_243 = concat(axis = var_5328, interleave = input_243_interleave_0, values = (x_163, var_5330))[name = string("input_243")];
+            tensor<int32, [1]> normed_233_axes_0 = const()[name = string("normed_233_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5325_to_fp16 = const()[name = string("op_5325_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_233_cast_fp16 = layer_norm(axes = normed_233_axes_0, epsilon = var_5325_to_fp16, x = input_243)[name = string("normed_233_cast_fp16")];
+            tensor<int32, [2]> var_5335_split_sizes_0 = const()[name = string("op_5335_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5335_axis_0 = const()[name = string("op_5335_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_5335_0, tensor<fp16, [1, 2, 256]> var_5335_1 = split(axis = var_5335_axis_0, split_sizes = var_5335_split_sizes_0, x = normed_233_cast_fp16)[name = string("op_5335")];
+            tensor<fp16, [1, 2, 256]> var_5337 = mul(x = var_5335_0, y = layers_8_self_attn_k_norm_weight)[name = string("op_5337")];
+            tensor<int32, [4]> var_5342 = const()[name = string("op_5342"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_69 = reshape(shape = var_5342, x = var_5337)[name = string("q_69")];
+            fp16 var_5344_promoted = const()[name = string("op_5344_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_5313 = transpose(perm = var_5312, x = var_5307)[name = string("transpose_68")];
+            tensor<fp16, [1, 2, 1, 256]> var_5345 = pow(x = var_5313, y = var_5344_promoted)[name = string("op_5345")];
+            tensor<int32, [1]> var_5350_axes_0 = const()[name = string("op_5350_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5350_keep_dims_0 = const()[name = string("op_5350_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_5350 = reduce_mean(axes = var_5350_axes_0, keep_dims = var_5350_keep_dims_0, x = var_5345)[name = string("op_5350")];
+            fp16 var_5352_to_fp16 = const()[name = string("op_5352_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_17_cast_fp16 = add(x = var_5350, y = var_5352_to_fp16)[name = string("mean_sq_17_cast_fp16")];
+            fp32 var_5354_epsilon_0 = const()[name = string("op_5354_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_5354_cast_fp16 = rsqrt(epsilon = var_5354_epsilon_0, x = mean_sq_17_cast_fp16)[name = string("op_5354_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_247_cast_fp16 = mul(x = var_5313, y = var_5354_cast_fp16)[name = string("input_247_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_5356_cast_fp16 = mul(x = q_69, y = cos_s)[name = string("op_5356_cast_fp16")];
+            tensor<int32, [2]> var_5357_split_sizes_0 = const()[name = string("op_5357_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5357_axis_0 = const()[name = string("op_5357_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_5357_0, tensor<fp16, [1, 2, 1, 128]> var_5357_1 = split(axis = var_5357_axis_0, split_sizes = var_5357_split_sizes_0, x = q_69)[name = string("op_5357")];
+            fp16 const_98_promoted = const()[name = string("const_98_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_5359 = mul(x = var_5357_1, y = const_98_promoted)[name = string("op_5359")];
+            int32 var_5361 = const()[name = string("op_5361"), val = int32(-1)];
+            bool var_5362_interleave_0 = const()[name = string("op_5362_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_5362 = concat(axis = var_5361, interleave = var_5362_interleave_0, values = (var_5359, var_5357_0))[name = string("op_5362")];
+            tensor<fp16, [1, 2, 1, 256]> var_5363_cast_fp16 = mul(x = var_5362, y = sin_s)[name = string("op_5363_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_245_cast_fp16 = add(x = var_5356_cast_fp16, y = var_5363_cast_fp16)[name = string("input_245_cast_fp16")];
+            tensor<int32, [8]> k_padded_15_pad_0 = const()[name = string("k_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_15_mode_0 = const()[name = string("k_padded_15_mode_0"), val = string("constant")];
+            fp16 const_99_to_fp16 = const()[name = string("const_99_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_15_cast_fp16 = pad(constant_val = const_99_to_fp16, mode = k_padded_15_mode_0, pad = k_padded_15_pad_0, x = input_245_cast_fp16)[name = string("k_padded_15_cast_fp16")];
+            tensor<int32, [8]> v_padded_15_pad_0 = const()[name = string("v_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_15_mode_0 = const()[name = string("v_padded_15_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_15_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = v_padded_15_mode_0, pad = v_padded_15_pad_0, x = input_247_cast_fp16)[name = string("v_padded_15_cast_fp16")];
+            tensor<int32, [4]> var_5392_begin_0 = const()[name = string("op_5392_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5392_end_0 = const()[name = string("op_5392_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5392_end_mask_0 = const()[name = string("op_5392_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5392_cast_fp16 = slice_by_index(begin = var_5392_begin_0, end = var_5392_end_0, end_mask = var_5392_end_mask_0, x = K_sliding_slot_15_cast_fp16)[name = string("op_5392_cast_fp16")];
+            int32 var_5399 = const()[name = string("op_5399"), val = int32(2)];
+            bool K_sliding_out_15_interleave_0 = const()[name = string("K_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_15_cast_fp16 = concat(axis = var_5399, interleave = K_sliding_out_15_interleave_0, values = (var_5392_cast_fp16, k_padded_15_cast_fp16))[name = string("K_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_5415_begin_0 = const()[name = string("op_5415_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5415_end_0 = const()[name = string("op_5415_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5415_end_mask_0 = const()[name = string("op_5415_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5415_cast_fp16 = slice_by_index(begin = var_5415_begin_0, end = var_5415_end_0, end_mask = var_5415_end_mask_0, x = V_sliding_slot_15_cast_fp16)[name = string("op_5415_cast_fp16")];
+            int32 var_5422 = const()[name = string("op_5422"), val = int32(2)];
+            bool V_sliding_out_15_interleave_0 = const()[name = string("V_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_15_cast_fp16 = concat(axis = var_5422, interleave = V_sliding_out_15_interleave_0, values = (var_5415_cast_fp16, v_padded_15_cast_fp16))[name = string("V_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_17_begin_0 = const()[name = string("K_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_17_end_0 = const()[name = string("K_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_17_end_mask_0 = const()[name = string("K_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_17_cast_fp16 = slice_by_index(begin = K_for_attn_17_begin_0, end = K_for_attn_17_end_0, end_mask = K_for_attn_17_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("K_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_17_begin_0 = const()[name = string("V_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_17_end_0 = const()[name = string("V_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_17_end_mask_0 = const()[name = string("V_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_17_cast_fp16 = slice_by_index(begin = V_for_attn_17_begin_0, end = V_for_attn_17_end_0, end_mask = V_for_attn_17_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("V_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> transpose_32_perm_0 = const()[name = string("transpose_32_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_16_reps_0 = const()[name = string("tile_16_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_32_cast_fp16 = transpose(perm = transpose_32_perm_0, x = K_for_attn_17_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [8, 1, 512, 256]> tile_16_cast_fp16 = tile(reps = tile_16_reps_0, x = transpose_32_cast_fp16)[name = string("tile_16_cast_fp16")];
+            tensor<int32, [5]> concat_32 = const()[name = string("concat_32"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_32_cast_fp16 = reshape(shape = concat_32, x = tile_16_cast_fp16)[name = string("reshape_32_cast_fp16")];
+            tensor<int32, [5]> transpose_33_perm_0 = const()[name = string("transpose_33_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_33 = const()[name = string("concat_33"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_33_cast_fp16 = transpose(perm = transpose_33_perm_0, x = reshape_32_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_33_cast_fp16 = reshape(shape = concat_33, x = transpose_33_cast_fp16)[name = string("reshape_33_cast_fp16")];
+            tensor<int32, [4]> transpose_56_perm_0 = const()[name = string("transpose_56_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_34_perm_0 = const()[name = string("transpose_34_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_17_reps_0 = const()[name = string("tile_17_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_34_cast_fp16 = transpose(perm = transpose_34_perm_0, x = V_for_attn_17_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [8, 1, 512, 256]> tile_17_cast_fp16 = tile(reps = tile_17_reps_0, x = transpose_34_cast_fp16)[name = string("tile_17_cast_fp16")];
+            tensor<int32, [5]> concat_34 = const()[name = string("concat_34"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_34_cast_fp16 = reshape(shape = concat_34, x = tile_17_cast_fp16)[name = string("reshape_34_cast_fp16")];
+            tensor<int32, [5]> transpose_35_perm_0 = const()[name = string("transpose_35_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_35 = const()[name = string("concat_35"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_35_cast_fp16 = transpose(perm = transpose_35_perm_0, x = reshape_34_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_35_cast_fp16 = reshape(shape = concat_35, x = transpose_35_cast_fp16)[name = string("reshape_35_cast_fp16")];
+            tensor<int32, [4]> V_expanded_17_perm_0 = const()[name = string("V_expanded_17_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_56_cast_fp16 = transpose(perm = transpose_56_perm_0, x = reshape_33_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_71_cast_fp16, y = transpose_56_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_167_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_167_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_167_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 1, 512]> var_5463 = sub(x = x_167_cast_fp16, y = reduce_max_8)[name = string("op_5463")];
+            tensor<fp16, [1, 8, 1, 512]> var_5469 = exp(x = var_5463)[name = string("op_5469")];
+            tensor<int32, [1]> var_5479_axes_0 = const()[name = string("op_5479_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5479_keep_dims_0 = const()[name = string("op_5479_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_5479 = reduce_sum(axes = var_5479_axes_0, keep_dims = var_5479_keep_dims_0, x = var_5469)[name = string("op_5479")];
+            tensor<fp16, [1, 8, 1, 512]> var_5485_cast_fp16 = real_div(x = var_5469, y = var_5479)[name = string("op_5485_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_17_cast_fp16 = transpose(perm = V_expanded_17_perm_0, x = reshape_35_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_5485_cast_fp16, y = V_expanded_17_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_5496 = const()[name = string("op_5496"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5503 = const()[name = string("op_5503"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_5497_cast_fp16 = transpose(perm = var_5496, x = attn_output_49_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 1, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_5503, x = var_5497_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_5508 = const()[name = string("op_5508"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5524_pad_type_0 = const()[name = string("op_5524_pad_type_0"), val = string("valid")];
+            int32 var_5524_groups_0 = const()[name = string("op_5524_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5524_strides_0 = const()[name = string("op_5524_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5524_pad_0 = const()[name = string("op_5524_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5524_dilations_0 = const()[name = string("op_5524_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557679808))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560301312))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_5509_cast_fp16 = transpose(perm = var_5508, x = attn_output_51_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 2560, 1]> var_5524_cast_fp16 = conv(dilations = var_5524_dilations_0, groups = var_5524_groups_0, pad = var_5524_pad_0, pad_type = var_5524_pad_type_0, strides = var_5524_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_5509_cast_fp16)[name = string("op_5524_cast_fp16")];
+            tensor<int32, [3]> var_5528 = const()[name = string("op_5528"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5534 = const()[name = string("op_5534"), val = int32(-1)];
+            fp16 const_101_promoted_to_fp16 = const()[name = string("const_101_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_171_cast_fp16 = transpose(perm = var_5528, x = var_5524_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 1, 2560]> var_5536_cast_fp16 = mul(x = x_171_cast_fp16, y = const_101_promoted_to_fp16)[name = string("op_5536_cast_fp16")];
+            bool input_251_interleave_0 = const()[name = string("input_251_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_251_cast_fp16 = concat(axis = var_5534, interleave = input_251_interleave_0, values = (x_171_cast_fp16, var_5536_cast_fp16))[name = string("input_251_cast_fp16")];
+            tensor<int32, [1]> normed_237_axes_0 = const()[name = string("normed_237_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5531_to_fp16 = const()[name = string("op_5531_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_237_cast_fp16 = layer_norm(axes = normed_237_axes_0, epsilon = var_5531_to_fp16, x = input_251_cast_fp16)[name = string("normed_237_cast_fp16")];
+            tensor<int32, [2]> var_5541_split_sizes_0 = const()[name = string("op_5541_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5541_axis_0 = const()[name = string("op_5541_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5541_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5541_cast_fp16_1 = split(axis = var_5541_axis_0, split_sizes = var_5541_split_sizes_0, x = normed_237_cast_fp16)[name = string("op_5541_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560303936)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_53_cast_fp16 = mul(x = var_5541_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_53_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_173_cast_fp16 = add(x = x_159_cast_fp16, y = attn_output_53_cast_fp16)[name = string("x_173_cast_fp16")];
+            int32 var_5550 = const()[name = string("op_5550"), val = int32(-1)];
+            fp16 const_102_promoted_to_fp16 = const()[name = string("const_102_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5552_cast_fp16 = mul(x = x_173_cast_fp16, y = const_102_promoted_to_fp16)[name = string("op_5552_cast_fp16")];
+            bool input_253_interleave_0 = const()[name = string("input_253_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_253_cast_fp16 = concat(axis = var_5550, interleave = input_253_interleave_0, values = (x_173_cast_fp16, var_5552_cast_fp16))[name = string("input_253_cast_fp16")];
+            tensor<int32, [1]> normed_241_axes_0 = const()[name = string("normed_241_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5547_to_fp16 = const()[name = string("op_5547_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_241_cast_fp16 = layer_norm(axes = normed_241_axes_0, epsilon = var_5547_to_fp16, x = input_253_cast_fp16)[name = string("normed_241_cast_fp16")];
+            tensor<int32, [2]> var_5557_split_sizes_0 = const()[name = string("op_5557_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5557_axis_0 = const()[name = string("op_5557_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5557_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5557_cast_fp16_1 = split(axis = var_5557_axis_0, split_sizes = var_5557_split_sizes_0, x = normed_241_cast_fp16)[name = string("op_5557_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560309120)))];
+            tensor<fp16, [1, 1, 2560]> h_51_cast_fp16 = mul(x = var_5557_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_5568 = const()[name = string("op_5568"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_255_axes_0 = const()[name = string("input_255_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5569 = transpose(perm = var_5568, x = h_51_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 2560, 1, 1]> input_255 = expand_dims(axes = input_255_axes_0, x = var_5569)[name = string("input_255")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_255)[name = string("gate_33")];
+            string up_17_pad_type_0 = const()[name = string("up_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_17_strides_0 = const()[name = string("up_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_17_pad_0 = const()[name = string("up_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_17_dilations_0 = const()[name = string("up_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_17_groups_0 = const()[name = string("up_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_17 = conv(dilations = up_17_dilations_0, groups = up_17_groups_0, pad = up_17_pad_0, pad_type = up_17_pad_type_0, strides = up_17_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_255)[name = string("up_17")];
+            string gate_35_mode_0 = const()[name = string("gate_35_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_35 = gelu(mode = gate_35_mode_0, x = gate_33)[name = string("gate_35")];
+            tensor<fp16, [1, 10240, 1, 1]> input_257 = mul(x = gate_35, y = up_17)[name = string("input_257")];
+            string mlp_out_17_pad_type_0 = const()[name = string("mlp_out_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_17_strides_0 = const()[name = string("mlp_out_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_17_pad_0 = const()[name = string("mlp_out_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_17_dilations_0 = const()[name = string("mlp_out_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_17_groups_0 = const()[name = string("mlp_out_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_17 = conv(dilations = mlp_out_17_dilations_0, groups = mlp_out_17_groups_0, pad = mlp_out_17_pad_0, pad_type = mlp_out_17_pad_type_0, strides = mlp_out_17_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_257)[name = string("mlp_out_17")];
+            tensor<int32, [1]> var_5609_axes_0 = const()[name = string("op_5609_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5609 = squeeze(axes = var_5609_axes_0, x = mlp_out_17)[name = string("op_5609")];
+            tensor<int32, [3]> var_5613 = const()[name = string("op_5613"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5619 = const()[name = string("op_5619"), val = int32(-1)];
+            fp16 const_103_promoted = const()[name = string("const_103_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_175 = transpose(perm = var_5613, x = var_5609)[name = string("transpose_57")];
+            tensor<fp16, [1, 1, 2560]> var_5621 = mul(x = x_175, y = const_103_promoted)[name = string("op_5621")];
+            bool input_259_interleave_0 = const()[name = string("input_259_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_259 = concat(axis = var_5619, interleave = input_259_interleave_0, values = (x_175, var_5621))[name = string("input_259")];
+            tensor<int32, [1]> normed_245_axes_0 = const()[name = string("normed_245_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5616_to_fp16 = const()[name = string("op_5616_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_245_cast_fp16 = layer_norm(axes = normed_245_axes_0, epsilon = var_5616_to_fp16, x = input_259)[name = string("normed_245_cast_fp16")];
+            tensor<int32, [2]> var_5626_split_sizes_0 = const()[name = string("op_5626_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5626_axis_0 = const()[name = string("op_5626_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5626_0, tensor<fp16, [1, 1, 2560]> var_5626_1 = split(axis = var_5626_axis_0, split_sizes = var_5626_split_sizes_0, x = normed_245_cast_fp16)[name = string("op_5626")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_83 = mul(x = var_5626_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_85_cast_fp16 = add(x = x_173_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_17_begin_0 = const()[name = string("per_layer_slice_17_begin_0"), val = tensor<int32, [3]>([0, 0, 5120])];
+            tensor<int32, [3]> per_layer_slice_17_end_0 = const()[name = string("per_layer_slice_17_end_0"), val = tensor<int32, [3]>([1, 1, 5376])];
+            tensor<bool, [3]> per_layer_slice_17_end_mask_0 = const()[name = string("per_layer_slice_17_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_17_cast_fp16 = slice_by_index(begin = per_layer_slice_17_begin_0, end = per_layer_slice_17_end_0, end_mask = per_layer_slice_17_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_17_cast_fp16")];
+            tensor<int32, [3]> var_5654 = const()[name = string("op_5654"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_261_axes_0 = const()[name = string("input_261_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5655 = transpose(perm = var_5654, x = hidden_states_85_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 1, 1]> input_261 = expand_dims(axes = input_261_axes_0, x = var_5655)[name = string("input_261")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_261)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_5674 = const()[name = string("op_5674"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_17_axes_0 = const()[name = string("per_layer_slice_conv_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_5675_cast_fp16 = transpose(perm = var_5674, x = per_layer_slice_17_cast_fp16)[name = string("transpose_55")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_17_cast_fp16 = expand_dims(axes = per_layer_slice_conv_17_axes_0, x = var_5675_cast_fp16)[name = string("per_layer_slice_conv_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_263_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_17_cast_fp16)[name = string("input_263_cast_fp16")];
+            string gated_53_pad_type_0 = const()[name = string("gated_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_53_strides_0 = const()[name = string("gated_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_53_pad_0 = const()[name = string("gated_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_53_dilations_0 = const()[name = string("gated_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_53_groups_0 = const()[name = string("gated_53_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560314304))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560642048))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_53_cast_fp16 = conv(dilations = gated_53_dilations_0, groups = gated_53_groups_0, pad = gated_53_pad_0, pad_type = gated_53_pad_type_0, strides = gated_53_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_263_cast_fp16)[name = string("gated_53_cast_fp16")];
+            tensor<int32, [1]> var_5691_axes_0 = const()[name = string("op_5691_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5691_cast_fp16 = squeeze(axes = var_5691_axes_0, x = gated_53_cast_fp16)[name = string("op_5691_cast_fp16")];
+            tensor<int32, [3]> var_5695 = const()[name = string("op_5695"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5701 = const()[name = string("op_5701"), val = int32(-1)];
+            fp16 const_104_promoted_to_fp16 = const()[name = string("const_104_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_177_cast_fp16 = transpose(perm = var_5695, x = var_5691_cast_fp16)[name = string("transpose_54")];
+            tensor<fp16, [1, 1, 2560]> var_5703_cast_fp16 = mul(x = x_177_cast_fp16, y = const_104_promoted_to_fp16)[name = string("op_5703_cast_fp16")];
+            bool input_265_interleave_0 = const()[name = string("input_265_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_265_cast_fp16 = concat(axis = var_5701, interleave = input_265_interleave_0, values = (x_177_cast_fp16, var_5703_cast_fp16))[name = string("input_265_cast_fp16")];
+            tensor<int32, [1]> normed_249_axes_0 = const()[name = string("normed_249_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5698_to_fp16 = const()[name = string("op_5698_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_249_cast_fp16 = layer_norm(axes = normed_249_axes_0, epsilon = var_5698_to_fp16, x = input_265_cast_fp16)[name = string("normed_249_cast_fp16")];
+            tensor<int32, [2]> var_5708_split_sizes_0 = const()[name = string("op_5708_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5708_axis_0 = const()[name = string("op_5708_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5708_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5708_cast_fp16_1 = split(axis = var_5708_axis_0, split_sizes = var_5708_split_sizes_0, x = normed_249_cast_fp16)[name = string("op_5708_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560644672)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_89_cast_fp16 = mul(x = var_5708_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_91_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1]> const_105_promoted_to_fp16 = const()[name = string("const_105_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.bap-2])];
+            tensor<fp16, [1, 1, 2560]> x_179_cast_fp16 = mul(x = hidden_states_91_cast_fp16, y = const_105_promoted_to_fp16)[name = string("x_179_cast_fp16")];
+            tensor<int32, [1]> var_5720_axes_0 = const()[name = string("op_5720_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5720_cast_fp16 = squeeze(axes = var_5720_axes_0, x = K_sliding_out_15_cast_fp16)[name = string("op_5720_cast_fp16")];
+            tensor<int32, [1]> var_5722_axes_0 = const()[name = string("op_5722_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5722_cast_fp16 = squeeze(axes = var_5722_axes_0, x = V_sliding_out_15_cast_fp16)[name = string("op_5722_cast_fp16")];
+            tensor<int32, [4]> var_5725_begin_0 = const()[name = string("op_5725_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_5725_end_0 = const()[name = string("op_5725_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_5725_end_mask_0 = const()[name = string("op_5725_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5725_squeeze_mask_0 = const()[name = string("op_5725_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5725_cast_fp16 = slice_by_index(begin = var_5725_begin_0, end = var_5725_end_0, end_mask = var_5725_end_mask_0, squeeze_mask = var_5725_squeeze_mask_0, x = K_sliding_in)[name = string("op_5725_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_17_axes_0 = const()[name = string("K_sliding_slot_17_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_17_cast_fp16 = expand_dims(axes = K_sliding_slot_17_axes_0, x = var_5725_cast_fp16)[name = string("K_sliding_slot_17_cast_fp16")];
+            tensor<int32, [4]> var_5730_begin_0 = const()[name = string("op_5730_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_5730_end_0 = const()[name = string("op_5730_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_5730_end_mask_0 = const()[name = string("op_5730_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5730_squeeze_mask_0 = const()[name = string("op_5730_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5730_cast_fp16 = slice_by_index(begin = var_5730_begin_0, end = var_5730_end_0, end_mask = var_5730_end_mask_0, squeeze_mask = var_5730_squeeze_mask_0, x = V_sliding_in)[name = string("op_5730_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_17_axes_0 = const()[name = string("V_sliding_slot_17_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_17_cast_fp16 = expand_dims(axes = V_sliding_slot_17_axes_0, x = var_5730_cast_fp16)[name = string("V_sliding_slot_17_cast_fp16")];
+            int32 var_5737 = const()[name = string("op_5737"), val = int32(-1)];
+            fp16 const_106_promoted_to_fp16 = const()[name = string("const_106_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5739_cast_fp16 = mul(x = x_179_cast_fp16, y = const_106_promoted_to_fp16)[name = string("op_5739_cast_fp16")];
+            bool input_267_interleave_0 = const()[name = string("input_267_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_267_cast_fp16 = concat(axis = var_5737, interleave = input_267_interleave_0, values = (x_179_cast_fp16, var_5739_cast_fp16))[name = string("input_267_cast_fp16")];
+            tensor<int32, [1]> normed_253_axes_0 = const()[name = string("normed_253_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5734_to_fp16 = const()[name = string("op_5734_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_253_cast_fp16 = layer_norm(axes = normed_253_axes_0, epsilon = var_5734_to_fp16, x = input_267_cast_fp16)[name = string("normed_253_cast_fp16")];
+            tensor<int32, [2]> var_5744_split_sizes_0 = const()[name = string("op_5744_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5744_axis_0 = const()[name = string("op_5744_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5744_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5744_cast_fp16_1 = split(axis = var_5744_axis_0, split_sizes = var_5744_split_sizes_0, x = normed_253_cast_fp16)[name = string("op_5744_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560649856)))];
+            tensor<fp16, [1, 1, 2560]> h_55_cast_fp16 = mul(x = var_5744_cast_fp16_0, y = layers_9_input_layernorm_weight_promoted_to_fp16)[name = string("h_55_cast_fp16")];
+            tensor<int32, [3]> var_5750 = const()[name = string("op_5750"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5753_axes_0 = const()[name = string("op_5753_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5751_cast_fp16 = transpose(perm = var_5750, x = h_55_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 2560, 1, 1]> var_5753_cast_fp16 = expand_dims(axes = var_5753_axes_0, x = var_5751_cast_fp16)[name = string("op_5753_cast_fp16")];
+            string var_5769_pad_type_0 = const()[name = string("op_5769_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5769_strides_0 = const()[name = string("op_5769_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5769_pad_0 = const()[name = string("op_5769_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5769_dilations_0 = const()[name = string("op_5769_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5769_groups_0 = const()[name = string("op_5769_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_5769 = conv(dilations = var_5769_dilations_0, groups = var_5769_groups_0, pad = var_5769_pad_0, pad_type = var_5769_pad_type_0, strides = var_5769_strides_0, weight = layers_9_self_attn_q_proj_weight_palettized, x = var_5753_cast_fp16)[name = string("op_5769")];
+            tensor<int32, [4]> var_5774 = const()[name = string("op_5774"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_5775 = reshape(shape = var_5774, x = var_5769)[name = string("op_5775")];
+            tensor<int32, [4]> var_5780 = const()[name = string("op_5780"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5790 = const()[name = string("op_5790"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_5781 = transpose(perm = var_5780, x = var_5775)[name = string("transpose_52")];
+            tensor<fp16, [1, 8, 256]> x_181 = reshape(shape = var_5790, x = var_5781)[name = string("x_181")];
+            int32 var_5796 = const()[name = string("op_5796"), val = int32(-1)];
+            fp16 const_107_promoted = const()[name = string("const_107_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_5798 = mul(x = x_181, y = const_107_promoted)[name = string("op_5798")];
+            bool input_271_interleave_0 = const()[name = string("input_271_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_271 = concat(axis = var_5796, interleave = input_271_interleave_0, values = (x_181, var_5798))[name = string("input_271")];
+            tensor<int32, [1]> normed_257_axes_0 = const()[name = string("normed_257_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5793_to_fp16 = const()[name = string("op_5793_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_257_cast_fp16 = layer_norm(axes = normed_257_axes_0, epsilon = var_5793_to_fp16, x = input_271)[name = string("normed_257_cast_fp16")];
+            tensor<int32, [2]> var_5803_split_sizes_0 = const()[name = string("op_5803_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5803_axis_0 = const()[name = string("op_5803_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_5803_0, tensor<fp16, [1, 8, 256]> var_5803_1 = split(axis = var_5803_axis_0, split_sizes = var_5803_split_sizes_0, x = normed_257_cast_fp16)[name = string("op_5803")];
+            tensor<fp16, [1, 8, 256]> var_5805 = mul(x = var_5803_0, y = layers_9_self_attn_q_norm_weight)[name = string("op_5805")];
+            tensor<int32, [4]> var_5810 = const()[name = string("op_5810"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_75 = reshape(shape = var_5810, x = var_5805)[name = string("q_75")];
+            tensor<fp16, [1, 8, 1, 256]> var_5812_cast_fp16 = mul(x = q_75, y = cos_s)[name = string("op_5812_cast_fp16")];
+            tensor<int32, [2]> var_5813_split_sizes_0 = const()[name = string("op_5813_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5813_axis_0 = const()[name = string("op_5813_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_5813_0, tensor<fp16, [1, 8, 1, 128]> var_5813_1 = split(axis = var_5813_axis_0, split_sizes = var_5813_split_sizes_0, x = q_75)[name = string("op_5813")];
+            fp16 const_108_promoted = const()[name = string("const_108_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_5815 = mul(x = var_5813_1, y = const_108_promoted)[name = string("op_5815")];
+            int32 var_5817 = const()[name = string("op_5817"), val = int32(-1)];
+            bool var_5818_interleave_0 = const()[name = string("op_5818_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_5818 = concat(axis = var_5817, interleave = var_5818_interleave_0, values = (var_5815, var_5813_0))[name = string("op_5818")];
+            tensor<fp16, [1, 8, 1, 256]> var_5819_cast_fp16 = mul(x = var_5818, y = sin_s)[name = string("op_5819_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_79_cast_fp16 = add(x = var_5812_cast_fp16, y = var_5819_cast_fp16)[name = string("q_79_cast_fp16")];
+            string var_5832_pad_type_0 = const()[name = string("op_5832_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5832_strides_0 = const()[name = string("op_5832_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5832_pad_0 = const()[name = string("op_5832_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5832_dilations_0 = const()[name = string("op_5832_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5832_groups_0 = const()[name = string("op_5832_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5832 = conv(dilations = var_5832_dilations_0, groups = var_5832_groups_0, pad = var_5832_pad_0, pad_type = var_5832_pad_type_0, strides = var_5832_strides_0, weight = layers_9_self_attn_k_proj_weight_palettized, x = var_5753_cast_fp16)[name = string("op_5832")];
+            tensor<int32, [4]> var_5837 = const()[name = string("op_5837"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5838 = reshape(shape = var_5837, x = var_5832)[name = string("op_5838")];
+            tensor<int32, [4]> var_5843 = const()[name = string("op_5843"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_5860_pad_type_0 = const()[name = string("op_5860_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5860_strides_0 = const()[name = string("op_5860_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5860_pad_0 = const()[name = string("op_5860_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5860_dilations_0 = const()[name = string("op_5860_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5860_groups_0 = const()[name = string("op_5860_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5860 = conv(dilations = var_5860_dilations_0, groups = var_5860_groups_0, pad = var_5860_pad_0, pad_type = var_5860_pad_type_0, strides = var_5860_strides_0, weight = layers_9_self_attn_v_proj_weight_palettized, x = var_5753_cast_fp16)[name = string("op_5860")];
+            tensor<int32, [4]> var_5865 = const()[name = string("op_5865"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5866 = reshape(shape = var_5865, x = var_5860)[name = string("op_5866")];
+            tensor<int32, [4]> var_5871 = const()[name = string("op_5871"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5881 = const()[name = string("op_5881"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_5844 = transpose(perm = var_5843, x = var_5838)[name = string("transpose_51")];
+            tensor<fp16, [1, 2, 256]> x_183 = reshape(shape = var_5881, x = var_5844)[name = string("x_183")];
+            int32 var_5887 = const()[name = string("op_5887"), val = int32(-1)];
+            fp16 const_109_promoted = const()[name = string("const_109_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_5889 = mul(x = x_183, y = const_109_promoted)[name = string("op_5889")];
+            bool input_273_interleave_0 = const()[name = string("input_273_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_273 = concat(axis = var_5887, interleave = input_273_interleave_0, values = (x_183, var_5889))[name = string("input_273")];
+            tensor<int32, [1]> normed_261_axes_0 = const()[name = string("normed_261_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5884_to_fp16 = const()[name = string("op_5884_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_261_cast_fp16 = layer_norm(axes = normed_261_axes_0, epsilon = var_5884_to_fp16, x = input_273)[name = string("normed_261_cast_fp16")];
+            tensor<int32, [2]> var_5894_split_sizes_0 = const()[name = string("op_5894_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5894_axis_0 = const()[name = string("op_5894_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_5894_0, tensor<fp16, [1, 2, 256]> var_5894_1 = split(axis = var_5894_axis_0, split_sizes = var_5894_split_sizes_0, x = normed_261_cast_fp16)[name = string("op_5894")];
+            tensor<fp16, [1, 2, 256]> var_5896 = mul(x = var_5894_0, y = layers_9_self_attn_k_norm_weight)[name = string("op_5896")];
+            tensor<int32, [4]> var_5901 = const()[name = string("op_5901"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_77 = reshape(shape = var_5901, x = var_5896)[name = string("q_77")];
+            fp16 var_5903_promoted = const()[name = string("op_5903_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_5872 = transpose(perm = var_5871, x = var_5866)[name = string("transpose_50")];
+            tensor<fp16, [1, 2, 1, 256]> var_5904 = pow(x = var_5872, y = var_5903_promoted)[name = string("op_5904")];
+            tensor<int32, [1]> var_5909_axes_0 = const()[name = string("op_5909_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5909_keep_dims_0 = const()[name = string("op_5909_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_5909 = reduce_mean(axes = var_5909_axes_0, keep_dims = var_5909_keep_dims_0, x = var_5904)[name = string("op_5909")];
+            fp16 var_5911_to_fp16 = const()[name = string("op_5911_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_19_cast_fp16 = add(x = var_5909, y = var_5911_to_fp16)[name = string("mean_sq_19_cast_fp16")];
+            fp32 var_5913_epsilon_0 = const()[name = string("op_5913_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_5913_cast_fp16 = rsqrt(epsilon = var_5913_epsilon_0, x = mean_sq_19_cast_fp16)[name = string("op_5913_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_277_cast_fp16 = mul(x = var_5872, y = var_5913_cast_fp16)[name = string("input_277_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_5915_cast_fp16 = mul(x = q_77, y = cos_s)[name = string("op_5915_cast_fp16")];
+            tensor<int32, [2]> var_5916_split_sizes_0 = const()[name = string("op_5916_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5916_axis_0 = const()[name = string("op_5916_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_5916_0, tensor<fp16, [1, 2, 1, 128]> var_5916_1 = split(axis = var_5916_axis_0, split_sizes = var_5916_split_sizes_0, x = q_77)[name = string("op_5916")];
+            fp16 const_110_promoted = const()[name = string("const_110_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_5918 = mul(x = var_5916_1, y = const_110_promoted)[name = string("op_5918")];
+            int32 var_5920 = const()[name = string("op_5920"), val = int32(-1)];
+            bool var_5921_interleave_0 = const()[name = string("op_5921_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_5921 = concat(axis = var_5920, interleave = var_5921_interleave_0, values = (var_5918, var_5916_0))[name = string("op_5921")];
+            tensor<fp16, [1, 2, 1, 256]> var_5922_cast_fp16 = mul(x = var_5921, y = sin_s)[name = string("op_5922_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_275_cast_fp16 = add(x = var_5915_cast_fp16, y = var_5922_cast_fp16)[name = string("input_275_cast_fp16")];
+            tensor<int32, [8]> k_padded_17_pad_0 = const()[name = string("k_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_17_mode_0 = const()[name = string("k_padded_17_mode_0"), val = string("constant")];
+            fp16 const_111_to_fp16 = const()[name = string("const_111_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_17_cast_fp16 = pad(constant_val = const_111_to_fp16, mode = k_padded_17_mode_0, pad = k_padded_17_pad_0, x = input_275_cast_fp16)[name = string("k_padded_17_cast_fp16")];
+            tensor<int32, [8]> v_padded_17_pad_0 = const()[name = string("v_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_17_mode_0 = const()[name = string("v_padded_17_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_17_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = v_padded_17_mode_0, pad = v_padded_17_pad_0, x = input_277_cast_fp16)[name = string("v_padded_17_cast_fp16")];
+            tensor<int32, [4]> var_5951_begin_0 = const()[name = string("op_5951_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5951_end_0 = const()[name = string("op_5951_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5951_end_mask_0 = const()[name = string("op_5951_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5951_cast_fp16 = slice_by_index(begin = var_5951_begin_0, end = var_5951_end_0, end_mask = var_5951_end_mask_0, x = K_sliding_slot_17_cast_fp16)[name = string("op_5951_cast_fp16")];
+            int32 var_5958 = const()[name = string("op_5958"), val = int32(2)];
+            bool K_sliding_out_17_interleave_0 = const()[name = string("K_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_17_cast_fp16 = concat(axis = var_5958, interleave = K_sliding_out_17_interleave_0, values = (var_5951_cast_fp16, k_padded_17_cast_fp16))[name = string("K_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_5974_begin_0 = const()[name = string("op_5974_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5974_end_0 = const()[name = string("op_5974_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5974_end_mask_0 = const()[name = string("op_5974_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5974_cast_fp16 = slice_by_index(begin = var_5974_begin_0, end = var_5974_end_0, end_mask = var_5974_end_mask_0, x = V_sliding_slot_17_cast_fp16)[name = string("op_5974_cast_fp16")];
+            int32 var_5981 = const()[name = string("op_5981"), val = int32(2)];
+            bool V_sliding_out_17_interleave_0 = const()[name = string("V_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_17_cast_fp16 = concat(axis = var_5981, interleave = V_sliding_out_17_interleave_0, values = (var_5974_cast_fp16, v_padded_17_cast_fp16))[name = string("V_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_19_begin_0 = const()[name = string("K_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_19_end_0 = const()[name = string("K_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_19_end_mask_0 = const()[name = string("K_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_19_cast_fp16 = slice_by_index(begin = K_for_attn_19_begin_0, end = K_for_attn_19_end_0, end_mask = K_for_attn_19_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("K_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_19_begin_0 = const()[name = string("V_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_19_end_0 = const()[name = string("V_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_19_end_mask_0 = const()[name = string("V_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_19_cast_fp16 = slice_by_index(begin = V_for_attn_19_begin_0, end = V_for_attn_19_end_0, end_mask = V_for_attn_19_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("V_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_18_reps_0 = const()[name = string("tile_18_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = K_for_attn_19_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [8, 1, 512, 256]> tile_18_cast_fp16 = tile(reps = tile_18_reps_0, x = transpose_36_cast_fp16)[name = string("tile_18_cast_fp16")];
+            tensor<int32, [5]> concat_36 = const()[name = string("concat_36"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_36_cast_fp16 = reshape(shape = concat_36, x = tile_18_cast_fp16)[name = string("reshape_36_cast_fp16")];
+            tensor<int32, [5]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_37 = const()[name = string("concat_37"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_36_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_37_cast_fp16 = reshape(shape = concat_37, x = transpose_37_cast_fp16)[name = string("reshape_37_cast_fp16")];
+            tensor<int32, [4]> transpose_57_perm_0 = const()[name = string("transpose_57_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_19_reps_0 = const()[name = string("tile_19_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = V_for_attn_19_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [8, 1, 512, 256]> tile_19_cast_fp16 = tile(reps = tile_19_reps_0, x = transpose_38_cast_fp16)[name = string("tile_19_cast_fp16")];
+            tensor<int32, [5]> concat_38 = const()[name = string("concat_38"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_38_cast_fp16 = reshape(shape = concat_38, x = tile_19_cast_fp16)[name = string("reshape_38_cast_fp16")];
+            tensor<int32, [5]> transpose_39_perm_0 = const()[name = string("transpose_39_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_39 = const()[name = string("concat_39"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_39_cast_fp16 = transpose(perm = transpose_39_perm_0, x = reshape_38_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_39_cast_fp16 = reshape(shape = concat_39, x = transpose_39_cast_fp16)[name = string("reshape_39_cast_fp16")];
+            tensor<int32, [4]> V_expanded_19_perm_0 = const()[name = string("V_expanded_19_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_37_transpose_x_0 = const()[name = string("attn_weights_37_transpose_x_0"), val = bool(false)];
+            bool attn_weights_37_transpose_y_0 = const()[name = string("attn_weights_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_57_cast_fp16 = transpose(perm = transpose_57_perm_0, x = reshape_37_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_37_cast_fp16 = matmul(transpose_x = attn_weights_37_transpose_x_0, transpose_y = attn_weights_37_transpose_y_0, x = q_79_cast_fp16, y = transpose_57_cast_fp16)[name = string("attn_weights_37_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_187_cast_fp16 = add(x = attn_weights_37_cast_fp16, y = causal_mask_sliding)[name = string("x_187_cast_fp16")];
+            tensor<int32, [1]> reduce_max_9_axes_0 = const()[name = string("reduce_max_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_9_keep_dims_0 = const()[name = string("reduce_max_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_9 = reduce_max(axes = reduce_max_9_axes_0, keep_dims = reduce_max_9_keep_dims_0, x = x_187_cast_fp16)[name = string("reduce_max_9")];
+            tensor<fp16, [1, 8, 1, 512]> var_6022 = sub(x = x_187_cast_fp16, y = reduce_max_9)[name = string("op_6022")];
+            tensor<fp16, [1, 8, 1, 512]> var_6028 = exp(x = var_6022)[name = string("op_6028")];
+            tensor<int32, [1]> var_6038_axes_0 = const()[name = string("op_6038_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6038_keep_dims_0 = const()[name = string("op_6038_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_6038 = reduce_sum(axes = var_6038_axes_0, keep_dims = var_6038_keep_dims_0, x = var_6028)[name = string("op_6038")];
+            tensor<fp16, [1, 8, 1, 512]> var_6044_cast_fp16 = real_div(x = var_6028, y = var_6038)[name = string("op_6044_cast_fp16")];
+            bool attn_output_55_transpose_x_0 = const()[name = string("attn_output_55_transpose_x_0"), val = bool(false)];
+            bool attn_output_55_transpose_y_0 = const()[name = string("attn_output_55_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_19_cast_fp16 = transpose(perm = V_expanded_19_perm_0, x = reshape_39_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_55_cast_fp16 = matmul(transpose_x = attn_output_55_transpose_x_0, transpose_y = attn_output_55_transpose_y_0, x = var_6044_cast_fp16, y = V_expanded_19_cast_fp16)[name = string("attn_output_55_cast_fp16")];
+            tensor<int32, [4]> var_6055 = const()[name = string("op_6055"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6062 = const()[name = string("op_6062"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_6056_cast_fp16 = transpose(perm = var_6055, x = attn_output_55_cast_fp16)[name = string("transpose_43")];
+            tensor<fp16, [1, 1, 2048]> attn_output_57_cast_fp16 = reshape(shape = var_6062, x = var_6056_cast_fp16)[name = string("attn_output_57_cast_fp16")];
+            tensor<int32, [3]> var_6067 = const()[name = string("op_6067"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6083_pad_type_0 = const()[name = string("op_6083_pad_type_0"), val = string("valid")];
+            int32 var_6083_groups_0 = const()[name = string("op_6083_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6083_strides_0 = const()[name = string("op_6083_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6083_pad_0 = const()[name = string("op_6083_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6083_dilations_0 = const()[name = string("op_6083_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_9_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560655040))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563276544))))[name = string("squeeze_9_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_6068_cast_fp16 = transpose(perm = var_6067, x = attn_output_57_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 2560, 1]> var_6083_cast_fp16 = conv(dilations = var_6083_dilations_0, groups = var_6083_groups_0, pad = var_6083_pad_0, pad_type = var_6083_pad_type_0, strides = var_6083_strides_0, weight = squeeze_9_cast_fp16_to_fp32_to_fp16_palettized, x = var_6068_cast_fp16)[name = string("op_6083_cast_fp16")];
+            tensor<int32, [3]> var_6087 = const()[name = string("op_6087"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6093 = const()[name = string("op_6093"), val = int32(-1)];
+            fp16 const_113_promoted_to_fp16 = const()[name = string("const_113_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_191_cast_fp16 = transpose(perm = var_6087, x = var_6083_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 1, 2560]> var_6095_cast_fp16 = mul(x = x_191_cast_fp16, y = const_113_promoted_to_fp16)[name = string("op_6095_cast_fp16")];
+            bool input_281_interleave_0 = const()[name = string("input_281_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_281_cast_fp16 = concat(axis = var_6093, interleave = input_281_interleave_0, values = (x_191_cast_fp16, var_6095_cast_fp16))[name = string("input_281_cast_fp16")];
+            tensor<int32, [1]> normed_265_axes_0 = const()[name = string("normed_265_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6090_to_fp16 = const()[name = string("op_6090_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_265_cast_fp16 = layer_norm(axes = normed_265_axes_0, epsilon = var_6090_to_fp16, x = input_281_cast_fp16)[name = string("normed_265_cast_fp16")];
+            tensor<int32, [2]> var_6100_split_sizes_0 = const()[name = string("op_6100_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6100_axis_0 = const()[name = string("op_6100_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6100_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6100_cast_fp16_1 = split(axis = var_6100_axis_0, split_sizes = var_6100_split_sizes_0, x = normed_265_cast_fp16)[name = string("op_6100_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563279168)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_59_cast_fp16 = mul(x = var_6100_cast_fp16_0, y = layers_9_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_193_cast_fp16 = add(x = x_179_cast_fp16, y = attn_output_59_cast_fp16)[name = string("x_193_cast_fp16")];
+            int32 var_6109 = const()[name = string("op_6109"), val = int32(-1)];
+            fp16 const_114_promoted_to_fp16 = const()[name = string("const_114_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6111_cast_fp16 = mul(x = x_193_cast_fp16, y = const_114_promoted_to_fp16)[name = string("op_6111_cast_fp16")];
+            bool input_283_interleave_0 = const()[name = string("input_283_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_283_cast_fp16 = concat(axis = var_6109, interleave = input_283_interleave_0, values = (x_193_cast_fp16, var_6111_cast_fp16))[name = string("input_283_cast_fp16")];
+            tensor<int32, [1]> normed_269_axes_0 = const()[name = string("normed_269_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6106_to_fp16 = const()[name = string("op_6106_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_269_cast_fp16 = layer_norm(axes = normed_269_axes_0, epsilon = var_6106_to_fp16, x = input_283_cast_fp16)[name = string("normed_269_cast_fp16")];
+            tensor<int32, [2]> var_6116_split_sizes_0 = const()[name = string("op_6116_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6116_axis_0 = const()[name = string("op_6116_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6116_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6116_cast_fp16_1 = split(axis = var_6116_axis_0, split_sizes = var_6116_split_sizes_0, x = normed_269_cast_fp16)[name = string("op_6116_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563284352)))];
+            tensor<fp16, [1, 1, 2560]> h_57_cast_fp16 = mul(x = var_6116_cast_fp16_0, y = layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_57_cast_fp16")];
+            tensor<int32, [3]> var_6127 = const()[name = string("op_6127"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_285_axes_0 = const()[name = string("input_285_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6128 = transpose(perm = var_6127, x = h_57_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 2560, 1, 1]> input_285 = expand_dims(axes = input_285_axes_0, x = var_6128)[name = string("input_285")];
+            string gate_37_pad_type_0 = const()[name = string("gate_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_37_strides_0 = const()[name = string("gate_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_37_pad_0 = const()[name = string("gate_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_37_dilations_0 = const()[name = string("gate_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_37_groups_0 = const()[name = string("gate_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_37 = conv(dilations = gate_37_dilations_0, groups = gate_37_groups_0, pad = gate_37_pad_0, pad_type = gate_37_pad_type_0, strides = gate_37_strides_0, weight = layers_9_mlp_gate_proj_weight_palettized, x = input_285)[name = string("gate_37")];
+            string up_19_pad_type_0 = const()[name = string("up_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_19_strides_0 = const()[name = string("up_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_19_pad_0 = const()[name = string("up_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_19_dilations_0 = const()[name = string("up_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_19_groups_0 = const()[name = string("up_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_19 = conv(dilations = up_19_dilations_0, groups = up_19_groups_0, pad = up_19_pad_0, pad_type = up_19_pad_type_0, strides = up_19_strides_0, weight = layers_9_mlp_up_proj_weight_palettized, x = input_285)[name = string("up_19")];
+            string gate_39_mode_0 = const()[name = string("gate_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_39 = gelu(mode = gate_39_mode_0, x = gate_37)[name = string("gate_39")];
+            tensor<fp16, [1, 10240, 1, 1]> input_287 = mul(x = gate_39, y = up_19)[name = string("input_287")];
+            string mlp_out_19_pad_type_0 = const()[name = string("mlp_out_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_19_strides_0 = const()[name = string("mlp_out_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_19_pad_0 = const()[name = string("mlp_out_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_19_dilations_0 = const()[name = string("mlp_out_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_19_groups_0 = const()[name = string("mlp_out_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_19 = conv(dilations = mlp_out_19_dilations_0, groups = mlp_out_19_groups_0, pad = mlp_out_19_pad_0, pad_type = mlp_out_19_pad_type_0, strides = mlp_out_19_strides_0, weight = layers_9_mlp_down_proj_weight_palettized, x = input_287)[name = string("mlp_out_19")];
+            tensor<int32, [1]> var_6168_axes_0 = const()[name = string("op_6168_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6168 = squeeze(axes = var_6168_axes_0, x = mlp_out_19)[name = string("op_6168")];
+            tensor<int32, [3]> var_6172 = const()[name = string("op_6172"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6178 = const()[name = string("op_6178"), val = int32(-1)];
+            fp16 const_115_promoted = const()[name = string("const_115_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_195 = transpose(perm = var_6172, x = var_6168)[name = string("transpose_39")];
+            tensor<fp16, [1, 1, 2560]> var_6180 = mul(x = x_195, y = const_115_promoted)[name = string("op_6180")];
+            bool input_289_interleave_0 = const()[name = string("input_289_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_289 = concat(axis = var_6178, interleave = input_289_interleave_0, values = (x_195, var_6180))[name = string("input_289")];
+            tensor<int32, [1]> normed_273_axes_0 = const()[name = string("normed_273_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6175_to_fp16 = const()[name = string("op_6175_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_273_cast_fp16 = layer_norm(axes = normed_273_axes_0, epsilon = var_6175_to_fp16, x = input_289)[name = string("normed_273_cast_fp16")];
+            tensor<int32, [2]> var_6185_split_sizes_0 = const()[name = string("op_6185_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6185_axis_0 = const()[name = string("op_6185_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6185_0, tensor<fp16, [1, 1, 2560]> var_6185_1 = split(axis = var_6185_axis_0, split_sizes = var_6185_split_sizes_0, x = normed_273_cast_fp16)[name = string("op_6185")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_93 = mul(x = var_6185_0, y = layers_9_post_feedforward_layernorm_weight)[name = string("hidden_states_93")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_95_cast_fp16 = add(x = x_193_cast_fp16, y = hidden_states_93)[name = string("hidden_states_95_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_19_begin_0 = const()[name = string("per_layer_slice_19_begin_0"), val = tensor<int32, [3]>([0, 0, 5376])];
+            tensor<int32, [3]> per_layer_slice_19_end_0 = const()[name = string("per_layer_slice_19_end_0"), val = tensor<int32, [3]>([1, 1, 5632])];
+            tensor<bool, [3]> per_layer_slice_19_end_mask_0 = const()[name = string("per_layer_slice_19_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_19_cast_fp16 = slice_by_index(begin = per_layer_slice_19_begin_0, end = per_layer_slice_19_end_0, end_mask = per_layer_slice_19_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_19_cast_fp16")];
+            tensor<int32, [3]> var_6213 = const()[name = string("op_6213"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_291_axes_0 = const()[name = string("input_291_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6214 = transpose(perm = var_6213, x = hidden_states_95_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 2560, 1, 1]> input_291 = expand_dims(axes = input_291_axes_0, x = var_6214)[name = string("input_291")];
+            string gated_55_pad_type_0 = const()[name = string("gated_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_55_strides_0 = const()[name = string("gated_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_55_pad_0 = const()[name = string("gated_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_55_dilations_0 = const()[name = string("gated_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_55_groups_0 = const()[name = string("gated_55_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_55 = conv(dilations = gated_55_dilations_0, groups = gated_55_groups_0, pad = gated_55_pad_0, pad_type = gated_55_pad_type_0, strides = gated_55_strides_0, weight = layers_9_per_layer_input_gate_weight_palettized, x = input_291)[name = string("gated_55")];
+            string gated_57_mode_0 = const()[name = string("gated_57_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_57 = gelu(mode = gated_57_mode_0, x = gated_55)[name = string("gated_57")];
+            tensor<int32, [3]> var_6233 = const()[name = string("op_6233"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_19_axes_0 = const()[name = string("per_layer_slice_conv_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_6234_cast_fp16 = transpose(perm = var_6233, x = per_layer_slice_19_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_19_cast_fp16 = expand_dims(axes = per_layer_slice_conv_19_axes_0, x = var_6234_cast_fp16)[name = string("per_layer_slice_conv_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_293_cast_fp16 = mul(x = gated_57, y = per_layer_slice_conv_19_cast_fp16)[name = string("input_293_cast_fp16")];
+            string gated_59_pad_type_0 = const()[name = string("gated_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_59_strides_0 = const()[name = string("gated_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_59_pad_0 = const()[name = string("gated_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_59_dilations_0 = const()[name = string("gated_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_59_groups_0 = const()[name = string("gated_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_9_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563289536))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563617280))))[name = string("layers_9_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_59_cast_fp16 = conv(dilations = gated_59_dilations_0, groups = gated_59_groups_0, pad = gated_59_pad_0, pad_type = gated_59_pad_type_0, strides = gated_59_strides_0, weight = layers_9_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_293_cast_fp16)[name = string("gated_59_cast_fp16")];
+            tensor<int32, [1]> var_6250_axes_0 = const()[name = string("op_6250_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6250_cast_fp16 = squeeze(axes = var_6250_axes_0, x = gated_59_cast_fp16)[name = string("op_6250_cast_fp16")];
+            tensor<int32, [3]> var_6254 = const()[name = string("op_6254"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6260 = const()[name = string("op_6260"), val = int32(-1)];
+            fp16 const_116_promoted_to_fp16 = const()[name = string("const_116_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_197_cast_fp16 = transpose(perm = var_6254, x = var_6250_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 1, 2560]> var_6262_cast_fp16 = mul(x = x_197_cast_fp16, y = const_116_promoted_to_fp16)[name = string("op_6262_cast_fp16")];
+            bool input_295_interleave_0 = const()[name = string("input_295_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_295_cast_fp16 = concat(axis = var_6260, interleave = input_295_interleave_0, values = (x_197_cast_fp16, var_6262_cast_fp16))[name = string("input_295_cast_fp16")];
+            tensor<int32, [1]> normed_277_axes_0 = const()[name = string("normed_277_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6257_to_fp16 = const()[name = string("op_6257_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_277_cast_fp16 = layer_norm(axes = normed_277_axes_0, epsilon = var_6257_to_fp16, x = input_295_cast_fp16)[name = string("normed_277_cast_fp16")];
+            tensor<int32, [2]> var_6267_split_sizes_0 = const()[name = string("op_6267_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6267_axis_0 = const()[name = string("op_6267_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6267_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6267_cast_fp16_1 = split(axis = var_6267_axis_0, split_sizes = var_6267_split_sizes_0, x = normed_277_cast_fp16)[name = string("op_6267_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563619904)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_99_cast_fp16 = mul(x = var_6267_cast_fp16_0, y = layers_9_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_101_cast_fp16 = add(x = hidden_states_95_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1]> const_117_promoted_to_fp16 = const()[name = string("const_117_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.d8p-2])];
+            tensor<fp16, [1, 1, 2560]> x_199_cast_fp16 = mul(x = hidden_states_101_cast_fp16, y = const_117_promoted_to_fp16)[name = string("x_199_cast_fp16")];
+            tensor<int32, [1]> var_6279_axes_0 = const()[name = string("op_6279_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6279_cast_fp16 = squeeze(axes = var_6279_axes_0, x = K_sliding_out_17_cast_fp16)[name = string("op_6279_cast_fp16")];
+            tensor<int32, [1]> var_6281_axes_0 = const()[name = string("op_6281_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6281_cast_fp16 = squeeze(axes = var_6281_axes_0, x = V_sliding_out_17_cast_fp16)[name = string("op_6281_cast_fp16")];
+            tensor<int32, [4]> var_6284_begin_0 = const()[name = string("op_6284_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6284_end_0 = const()[name = string("op_6284_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6284_end_mask_0 = const()[name = string("op_6284_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6284_squeeze_mask_0 = const()[name = string("op_6284_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6284_cast_fp16 = slice_by_index(begin = var_6284_begin_0, end = var_6284_end_0, end_mask = var_6284_end_mask_0, squeeze_mask = var_6284_squeeze_mask_0, x = K_sliding_in)[name = string("op_6284_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_axes_0 = const()[name = string("K_sliding_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_cast_fp16 = expand_dims(axes = K_sliding_slot_axes_0, x = var_6284_cast_fp16)[name = string("K_sliding_slot_cast_fp16")];
+            tensor<int32, [4]> var_6289_begin_0 = const()[name = string("op_6289_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6289_end_0 = const()[name = string("op_6289_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6289_end_mask_0 = const()[name = string("op_6289_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6289_squeeze_mask_0 = const()[name = string("op_6289_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6289_cast_fp16 = slice_by_index(begin = var_6289_begin_0, end = var_6289_end_0, end_mask = var_6289_end_mask_0, squeeze_mask = var_6289_squeeze_mask_0, x = V_sliding_in)[name = string("op_6289_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_axes_0 = const()[name = string("V_sliding_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_cast_fp16 = expand_dims(axes = V_sliding_slot_axes_0, x = var_6289_cast_fp16)[name = string("V_sliding_slot_cast_fp16")];
+            int32 var_6296 = const()[name = string("op_6296"), val = int32(-1)];
+            fp16 const_118_promoted_to_fp16 = const()[name = string("const_118_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6298_cast_fp16 = mul(x = x_199_cast_fp16, y = const_118_promoted_to_fp16)[name = string("op_6298_cast_fp16")];
+            bool input_297_interleave_0 = const()[name = string("input_297_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_297_cast_fp16 = concat(axis = var_6296, interleave = input_297_interleave_0, values = (x_199_cast_fp16, var_6298_cast_fp16))[name = string("input_297_cast_fp16")];
+            tensor<int32, [1]> normed_281_axes_0 = const()[name = string("normed_281_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6293_to_fp16 = const()[name = string("op_6293_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_281_cast_fp16 = layer_norm(axes = normed_281_axes_0, epsilon = var_6293_to_fp16, x = input_297_cast_fp16)[name = string("normed_281_cast_fp16")];
+            tensor<int32, [2]> var_6303_split_sizes_0 = const()[name = string("op_6303_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6303_axis_0 = const()[name = string("op_6303_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6303_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6303_cast_fp16_1 = split(axis = var_6303_axis_0, split_sizes = var_6303_split_sizes_0, x = normed_281_cast_fp16)[name = string("op_6303_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563625088)))];
+            tensor<fp16, [1, 1, 2560]> h_61_cast_fp16 = mul(x = var_6303_cast_fp16_0, y = layers_10_input_layernorm_weight_promoted_to_fp16)[name = string("h_61_cast_fp16")];
+            tensor<int32, [3]> var_6309 = const()[name = string("op_6309"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6312_axes_0 = const()[name = string("op_6312_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6310_cast_fp16 = transpose(perm = var_6309, x = h_61_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 2560, 1, 1]> var_6312_cast_fp16 = expand_dims(axes = var_6312_axes_0, x = var_6310_cast_fp16)[name = string("op_6312_cast_fp16")];
+            string var_6328_pad_type_0 = const()[name = string("op_6328_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6328_strides_0 = const()[name = string("op_6328_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6328_pad_0 = const()[name = string("op_6328_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6328_dilations_0 = const()[name = string("op_6328_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6328_groups_0 = const()[name = string("op_6328_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_6328 = conv(dilations = var_6328_dilations_0, groups = var_6328_groups_0, pad = var_6328_pad_0, pad_type = var_6328_pad_type_0, strides = var_6328_strides_0, weight = layers_10_self_attn_q_proj_weight_palettized, x = var_6312_cast_fp16)[name = string("op_6328")];
+            tensor<int32, [4]> var_6333 = const()[name = string("op_6333"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_6334 = reshape(shape = var_6333, x = var_6328)[name = string("op_6334")];
+            tensor<int32, [4]> var_6339 = const()[name = string("op_6339"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6349 = const()[name = string("op_6349"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_6340 = transpose(perm = var_6339, x = var_6334)[name = string("transpose_34")];
+            tensor<fp16, [1, 8, 256]> x_201 = reshape(shape = var_6349, x = var_6340)[name = string("x_201")];
+            int32 var_6355 = const()[name = string("op_6355"), val = int32(-1)];
+            fp16 const_119_promoted = const()[name = string("const_119_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_6357 = mul(x = x_201, y = const_119_promoted)[name = string("op_6357")];
+            bool input_301_interleave_0 = const()[name = string("input_301_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_301 = concat(axis = var_6355, interleave = input_301_interleave_0, values = (x_201, var_6357))[name = string("input_301")];
+            tensor<int32, [1]> normed_285_axes_0 = const()[name = string("normed_285_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6352_to_fp16 = const()[name = string("op_6352_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_285_cast_fp16 = layer_norm(axes = normed_285_axes_0, epsilon = var_6352_to_fp16, x = input_301)[name = string("normed_285_cast_fp16")];
+            tensor<int32, [2]> var_6362_split_sizes_0 = const()[name = string("op_6362_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6362_axis_0 = const()[name = string("op_6362_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_6362_0, tensor<fp16, [1, 8, 256]> var_6362_1 = split(axis = var_6362_axis_0, split_sizes = var_6362_split_sizes_0, x = normed_285_cast_fp16)[name = string("op_6362")];
+            tensor<fp16, [1, 8, 256]> var_6364 = mul(x = var_6362_0, y = layers_10_self_attn_q_norm_weight)[name = string("op_6364")];
+            tensor<int32, [4]> var_6369 = const()[name = string("op_6369"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_83 = reshape(shape = var_6369, x = var_6364)[name = string("q_83")];
+            tensor<fp16, [1, 8, 1, 256]> var_6371_cast_fp16 = mul(x = q_83, y = cos_s)[name = string("op_6371_cast_fp16")];
+            tensor<int32, [2]> var_6372_split_sizes_0 = const()[name = string("op_6372_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6372_axis_0 = const()[name = string("op_6372_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_6372_0, tensor<fp16, [1, 8, 1, 128]> var_6372_1 = split(axis = var_6372_axis_0, split_sizes = var_6372_split_sizes_0, x = q_83)[name = string("op_6372")];
+            fp16 const_120_promoted = const()[name = string("const_120_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_6374 = mul(x = var_6372_1, y = const_120_promoted)[name = string("op_6374")];
+            int32 var_6376 = const()[name = string("op_6376"), val = int32(-1)];
+            bool var_6377_interleave_0 = const()[name = string("op_6377_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_6377 = concat(axis = var_6376, interleave = var_6377_interleave_0, values = (var_6374, var_6372_0))[name = string("op_6377")];
+            tensor<fp16, [1, 8, 1, 256]> var_6378_cast_fp16 = mul(x = var_6377, y = sin_s)[name = string("op_6378_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_87_cast_fp16 = add(x = var_6371_cast_fp16, y = var_6378_cast_fp16)[name = string("q_87_cast_fp16")];
+            string var_6391_pad_type_0 = const()[name = string("op_6391_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6391_strides_0 = const()[name = string("op_6391_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6391_pad_0 = const()[name = string("op_6391_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6391_dilations_0 = const()[name = string("op_6391_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6391_groups_0 = const()[name = string("op_6391_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6391 = conv(dilations = var_6391_dilations_0, groups = var_6391_groups_0, pad = var_6391_pad_0, pad_type = var_6391_pad_type_0, strides = var_6391_strides_0, weight = layers_10_self_attn_k_proj_weight_palettized, x = var_6312_cast_fp16)[name = string("op_6391")];
+            tensor<int32, [4]> var_6396 = const()[name = string("op_6396"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6397 = reshape(shape = var_6396, x = var_6391)[name = string("op_6397")];
+            tensor<int32, [4]> var_6402 = const()[name = string("op_6402"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_6419_pad_type_0 = const()[name = string("op_6419_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6419_strides_0 = const()[name = string("op_6419_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6419_pad_0 = const()[name = string("op_6419_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6419_dilations_0 = const()[name = string("op_6419_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6419_groups_0 = const()[name = string("op_6419_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6419 = conv(dilations = var_6419_dilations_0, groups = var_6419_groups_0, pad = var_6419_pad_0, pad_type = var_6419_pad_type_0, strides = var_6419_strides_0, weight = layers_10_self_attn_v_proj_weight_palettized, x = var_6312_cast_fp16)[name = string("op_6419")];
+            tensor<int32, [4]> var_6424 = const()[name = string("op_6424"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6425 = reshape(shape = var_6424, x = var_6419)[name = string("op_6425")];
+            tensor<int32, [4]> var_6430 = const()[name = string("op_6430"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6440 = const()[name = string("op_6440"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_6403 = transpose(perm = var_6402, x = var_6397)[name = string("transpose_33")];
+            tensor<fp16, [1, 2, 256]> x_203 = reshape(shape = var_6440, x = var_6403)[name = string("x_203")];
+            int32 var_6446 = const()[name = string("op_6446"), val = int32(-1)];
+            fp16 const_121_promoted = const()[name = string("const_121_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_6448 = mul(x = x_203, y = const_121_promoted)[name = string("op_6448")];
+            bool input_303_interleave_0 = const()[name = string("input_303_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_303 = concat(axis = var_6446, interleave = input_303_interleave_0, values = (x_203, var_6448))[name = string("input_303")];
+            tensor<int32, [1]> normed_289_axes_0 = const()[name = string("normed_289_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6443_to_fp16 = const()[name = string("op_6443_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_289_cast_fp16 = layer_norm(axes = normed_289_axes_0, epsilon = var_6443_to_fp16, x = input_303)[name = string("normed_289_cast_fp16")];
+            tensor<int32, [2]> var_6453_split_sizes_0 = const()[name = string("op_6453_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6453_axis_0 = const()[name = string("op_6453_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_6453_0, tensor<fp16, [1, 2, 256]> var_6453_1 = split(axis = var_6453_axis_0, split_sizes = var_6453_split_sizes_0, x = normed_289_cast_fp16)[name = string("op_6453")];
+            tensor<fp16, [1, 2, 256]> var_6455 = mul(x = var_6453_0, y = layers_4_self_attn_k_norm_weight)[name = string("op_6455")];
+            tensor<int32, [4]> var_6460 = const()[name = string("op_6460"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_85 = reshape(shape = var_6460, x = var_6455)[name = string("q_85")];
+            fp16 var_6462_promoted = const()[name = string("op_6462_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_6431 = transpose(perm = var_6430, x = var_6425)[name = string("transpose_32")];
+            tensor<fp16, [1, 2, 1, 256]> var_6463 = pow(x = var_6431, y = var_6462_promoted)[name = string("op_6463")];
+            tensor<int32, [1]> var_6468_axes_0 = const()[name = string("op_6468_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6468_keep_dims_0 = const()[name = string("op_6468_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_6468 = reduce_mean(axes = var_6468_axes_0, keep_dims = var_6468_keep_dims_0, x = var_6463)[name = string("op_6468")];
+            fp16 var_6470_to_fp16 = const()[name = string("op_6470_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_21_cast_fp16 = add(x = var_6468, y = var_6470_to_fp16)[name = string("mean_sq_21_cast_fp16")];
+            fp32 var_6472_epsilon_0 = const()[name = string("op_6472_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_6472_cast_fp16 = rsqrt(epsilon = var_6472_epsilon_0, x = mean_sq_21_cast_fp16)[name = string("op_6472_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_307_cast_fp16 = mul(x = var_6431, y = var_6472_cast_fp16)[name = string("input_307_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_6474_cast_fp16 = mul(x = q_85, y = cos_s)[name = string("op_6474_cast_fp16")];
+            tensor<int32, [2]> var_6475_split_sizes_0 = const()[name = string("op_6475_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6475_axis_0 = const()[name = string("op_6475_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_6475_0, tensor<fp16, [1, 2, 1, 128]> var_6475_1 = split(axis = var_6475_axis_0, split_sizes = var_6475_split_sizes_0, x = q_85)[name = string("op_6475")];
+            fp16 const_122_promoted = const()[name = string("const_122_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_6477 = mul(x = var_6475_1, y = const_122_promoted)[name = string("op_6477")];
+            int32 var_6479 = const()[name = string("op_6479"), val = int32(-1)];
+            bool var_6480_interleave_0 = const()[name = string("op_6480_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_6480 = concat(axis = var_6479, interleave = var_6480_interleave_0, values = (var_6477, var_6475_0))[name = string("op_6480")];
+            tensor<fp16, [1, 2, 1, 256]> var_6481_cast_fp16 = mul(x = var_6480, y = sin_s)[name = string("op_6481_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_305_cast_fp16 = add(x = var_6474_cast_fp16, y = var_6481_cast_fp16)[name = string("input_305_cast_fp16")];
+            tensor<int32, [8]> k_padded_pad_0 = const()[name = string("k_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_mode_0 = const()[name = string("k_padded_mode_0"), val = string("constant")];
+            fp16 const_123_to_fp16 = const()[name = string("const_123_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_cast_fp16 = pad(constant_val = const_123_to_fp16, mode = k_padded_mode_0, pad = k_padded_pad_0, x = input_305_cast_fp16)[name = string("k_padded_cast_fp16")];
+            tensor<int32, [8]> v_padded_pad_0 = const()[name = string("v_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_mode_0 = const()[name = string("v_padded_mode_0"), val = string("constant")];
+            fp16 const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_cast_fp16 = pad(constant_val = const_124_to_fp16, mode = v_padded_mode_0, pad = v_padded_pad_0, x = input_307_cast_fp16)[name = string("v_padded_cast_fp16")];
+            tensor<int32, [4]> var_6510_begin_0 = const()[name = string("op_6510_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6510_end_0 = const()[name = string("op_6510_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6510_end_mask_0 = const()[name = string("op_6510_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6510_cast_fp16 = slice_by_index(begin = var_6510_begin_0, end = var_6510_end_0, end_mask = var_6510_end_mask_0, x = K_sliding_slot_cast_fp16)[name = string("op_6510_cast_fp16")];
+            int32 var_6517 = const()[name = string("op_6517"), val = int32(2)];
+            bool K_sliding_out_interleave_0 = const()[name = string("K_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_cast_fp16 = concat(axis = var_6517, interleave = K_sliding_out_interleave_0, values = (var_6510_cast_fp16, k_padded_cast_fp16))[name = string("K_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_6533_begin_0 = const()[name = string("op_6533_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6533_end_0 = const()[name = string("op_6533_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6533_end_mask_0 = const()[name = string("op_6533_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6533_cast_fp16 = slice_by_index(begin = var_6533_begin_0, end = var_6533_end_0, end_mask = var_6533_end_mask_0, x = V_sliding_slot_cast_fp16)[name = string("op_6533_cast_fp16")];
+            int32 var_6540 = const()[name = string("op_6540"), val = int32(2)];
+            bool V_sliding_out_interleave_0 = const()[name = string("V_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_cast_fp16 = concat(axis = var_6540, interleave = V_sliding_out_interleave_0, values = (var_6533_cast_fp16, v_padded_cast_fp16))[name = string("V_sliding_out_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_21_begin_0 = const()[name = string("K_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_21_end_0 = const()[name = string("K_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_21_end_mask_0 = const()[name = string("K_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> kv13_k = slice_by_index(begin = K_for_attn_21_begin_0, end = K_for_attn_21_end_0, end_mask = K_for_attn_21_end_mask_0, x = K_sliding_out_cast_fp16)[name = string("K_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_21_begin_0 = const()[name = string("V_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_21_end_0 = const()[name = string("V_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_21_end_mask_0 = const()[name = string("V_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> kv13_v = slice_by_index(begin = V_for_attn_21_begin_0, end = V_for_attn_21_end_0, end_mask = V_for_attn_21_end_mask_0, x = V_sliding_out_cast_fp16)[name = string("V_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_20_reps_0 = const()[name = string("tile_20_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_40_cast_fp16 = transpose(perm = transpose_40_perm_0, x = kv13_k)[name = string("transpose_31")];
+            tensor<fp16, [8, 1, 512, 256]> tile_20_cast_fp16 = tile(reps = tile_20_reps_0, x = transpose_40_cast_fp16)[name = string("tile_20_cast_fp16")];
+            tensor<int32, [5]> concat_40 = const()[name = string("concat_40"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_40_cast_fp16 = reshape(shape = concat_40, x = tile_20_cast_fp16)[name = string("reshape_40_cast_fp16")];
+            tensor<int32, [5]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_41 = const()[name = string("concat_41"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_40_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_41_cast_fp16 = reshape(shape = concat_41, x = transpose_41_cast_fp16)[name = string("reshape_41_cast_fp16")];
+            tensor<int32, [4]> transpose_58_perm_0 = const()[name = string("transpose_58_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_21_reps_0 = const()[name = string("tile_21_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_42_cast_fp16 = transpose(perm = transpose_42_perm_0, x = kv13_v)[name = string("transpose_29")];
+            tensor<fp16, [8, 1, 512, 256]> tile_21_cast_fp16 = tile(reps = tile_21_reps_0, x = transpose_42_cast_fp16)[name = string("tile_21_cast_fp16")];
+            tensor<int32, [5]> concat_42 = const()[name = string("concat_42"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_42_cast_fp16 = reshape(shape = concat_42, x = tile_21_cast_fp16)[name = string("reshape_42_cast_fp16")];
+            tensor<int32, [5]> transpose_43_perm_0 = const()[name = string("transpose_43_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_43 = const()[name = string("concat_43"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_43_cast_fp16 = transpose(perm = transpose_43_perm_0, x = reshape_42_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_43_cast_fp16 = reshape(shape = concat_43, x = transpose_43_cast_fp16)[name = string("reshape_43_cast_fp16")];
+            tensor<int32, [4]> V_expanded_21_perm_0 = const()[name = string("V_expanded_21_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_41_transpose_x_0 = const()[name = string("attn_weights_41_transpose_x_0"), val = bool(false)];
+            bool attn_weights_41_transpose_y_0 = const()[name = string("attn_weights_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_58_cast_fp16 = transpose(perm = transpose_58_perm_0, x = reshape_41_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_41_cast_fp16 = matmul(transpose_x = attn_weights_41_transpose_x_0, transpose_y = attn_weights_41_transpose_y_0, x = q_87_cast_fp16, y = transpose_58_cast_fp16)[name = string("attn_weights_41_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_207_cast_fp16 = add(x = attn_weights_41_cast_fp16, y = causal_mask_sliding)[name = string("x_207_cast_fp16")];
+            tensor<int32, [1]> reduce_max_10_axes_0 = const()[name = string("reduce_max_10_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_10_keep_dims_0 = const()[name = string("reduce_max_10_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_10 = reduce_max(axes = reduce_max_10_axes_0, keep_dims = reduce_max_10_keep_dims_0, x = x_207_cast_fp16)[name = string("reduce_max_10")];
+            tensor<fp16, [1, 8, 1, 512]> var_6591 = sub(x = x_207_cast_fp16, y = reduce_max_10)[name = string("op_6591")];
+            tensor<fp16, [1, 8, 1, 512]> var_6597 = exp(x = var_6591)[name = string("op_6597")];
+            tensor<int32, [1]> var_6607_axes_0 = const()[name = string("op_6607_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6607_keep_dims_0 = const()[name = string("op_6607_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_6607 = reduce_sum(axes = var_6607_axes_0, keep_dims = var_6607_keep_dims_0, x = var_6597)[name = string("op_6607")];
+            tensor<fp16, [1, 8, 1, 512]> var_6613_cast_fp16 = real_div(x = var_6597, y = var_6607)[name = string("op_6613_cast_fp16")];
+            bool attn_output_61_transpose_x_0 = const()[name = string("attn_output_61_transpose_x_0"), val = bool(false)];
+            bool attn_output_61_transpose_y_0 = const()[name = string("attn_output_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_21_cast_fp16 = transpose(perm = V_expanded_21_perm_0, x = reshape_43_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_61_cast_fp16 = matmul(transpose_x = attn_output_61_transpose_x_0, transpose_y = attn_output_61_transpose_y_0, x = var_6613_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_61_cast_fp16")];
+            tensor<int32, [4]> var_6624 = const()[name = string("op_6624"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6631 = const()[name = string("op_6631"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_6625_cast_fp16 = transpose(perm = var_6624, x = attn_output_61_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 1, 2048]> attn_output_63_cast_fp16 = reshape(shape = var_6631, x = var_6625_cast_fp16)[name = string("attn_output_63_cast_fp16")];
+            tensor<int32, [3]> var_6636 = const()[name = string("op_6636"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6652_pad_type_0 = const()[name = string("op_6652_pad_type_0"), val = string("valid")];
+            int32 var_6652_groups_0 = const()[name = string("op_6652_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6652_strides_0 = const()[name = string("op_6652_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6652_pad_0 = const()[name = string("op_6652_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6652_dilations_0 = const()[name = string("op_6652_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_10_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563630272))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566251776))))[name = string("squeeze_10_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_6637_cast_fp16 = transpose(perm = var_6636, x = attn_output_63_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1]> var_6652_cast_fp16 = conv(dilations = var_6652_dilations_0, groups = var_6652_groups_0, pad = var_6652_pad_0, pad_type = var_6652_pad_type_0, strides = var_6652_strides_0, weight = squeeze_10_cast_fp16_to_fp32_to_fp16_palettized, x = var_6637_cast_fp16)[name = string("op_6652_cast_fp16")];
+            tensor<int32, [3]> var_6656 = const()[name = string("op_6656"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6662 = const()[name = string("op_6662"), val = int32(-1)];
+            fp16 const_125_promoted_to_fp16 = const()[name = string("const_125_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_211_cast_fp16 = transpose(perm = var_6656, x = var_6652_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 1, 2560]> var_6664_cast_fp16 = mul(x = x_211_cast_fp16, y = const_125_promoted_to_fp16)[name = string("op_6664_cast_fp16")];
+            bool input_311_interleave_0 = const()[name = string("input_311_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_311_cast_fp16 = concat(axis = var_6662, interleave = input_311_interleave_0, values = (x_211_cast_fp16, var_6664_cast_fp16))[name = string("input_311_cast_fp16")];
+            tensor<int32, [1]> normed_293_axes_0 = const()[name = string("normed_293_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6659_to_fp16 = const()[name = string("op_6659_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_293_cast_fp16 = layer_norm(axes = normed_293_axes_0, epsilon = var_6659_to_fp16, x = input_311_cast_fp16)[name = string("normed_293_cast_fp16")];
+            tensor<int32, [2]> var_6669_split_sizes_0 = const()[name = string("op_6669_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6669_axis_0 = const()[name = string("op_6669_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6669_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6669_cast_fp16_1 = split(axis = var_6669_axis_0, split_sizes = var_6669_split_sizes_0, x = normed_293_cast_fp16)[name = string("op_6669_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566254400)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_65_cast_fp16 = mul(x = var_6669_cast_fp16_0, y = layers_10_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_65_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_213_cast_fp16 = add(x = x_199_cast_fp16, y = attn_output_65_cast_fp16)[name = string("x_213_cast_fp16")];
+            int32 var_6678 = const()[name = string("op_6678"), val = int32(-1)];
+            fp16 const_126_promoted_to_fp16 = const()[name = string("const_126_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6680_cast_fp16 = mul(x = x_213_cast_fp16, y = const_126_promoted_to_fp16)[name = string("op_6680_cast_fp16")];
+            bool input_313_interleave_0 = const()[name = string("input_313_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_313_cast_fp16 = concat(axis = var_6678, interleave = input_313_interleave_0, values = (x_213_cast_fp16, var_6680_cast_fp16))[name = string("input_313_cast_fp16")];
+            tensor<int32, [1]> normed_297_axes_0 = const()[name = string("normed_297_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6675_to_fp16 = const()[name = string("op_6675_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_297_cast_fp16 = layer_norm(axes = normed_297_axes_0, epsilon = var_6675_to_fp16, x = input_313_cast_fp16)[name = string("normed_297_cast_fp16")];
+            tensor<int32, [2]> var_6685_split_sizes_0 = const()[name = string("op_6685_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6685_axis_0 = const()[name = string("op_6685_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6685_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6685_cast_fp16_1 = split(axis = var_6685_axis_0, split_sizes = var_6685_split_sizes_0, x = normed_297_cast_fp16)[name = string("op_6685_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566259584)))];
+            tensor<fp16, [1, 1, 2560]> h_63_cast_fp16 = mul(x = var_6685_cast_fp16_0, y = layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_63_cast_fp16")];
+            tensor<int32, [3]> var_6696 = const()[name = string("op_6696"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_315_axes_0 = const()[name = string("input_315_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6697 = transpose(perm = var_6696, x = h_63_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 2560, 1, 1]> input_315 = expand_dims(axes = input_315_axes_0, x = var_6697)[name = string("input_315")];
+            string gate_41_pad_type_0 = const()[name = string("gate_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_41_strides_0 = const()[name = string("gate_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_41_pad_0 = const()[name = string("gate_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_41_dilations_0 = const()[name = string("gate_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_41_groups_0 = const()[name = string("gate_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_41 = conv(dilations = gate_41_dilations_0, groups = gate_41_groups_0, pad = gate_41_pad_0, pad_type = gate_41_pad_type_0, strides = gate_41_strides_0, weight = layers_10_mlp_gate_proj_weight_palettized, x = input_315)[name = string("gate_41")];
+            string up_21_pad_type_0 = const()[name = string("up_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_21_strides_0 = const()[name = string("up_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_21_pad_0 = const()[name = string("up_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_21_dilations_0 = const()[name = string("up_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_21_groups_0 = const()[name = string("up_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_21 = conv(dilations = up_21_dilations_0, groups = up_21_groups_0, pad = up_21_pad_0, pad_type = up_21_pad_type_0, strides = up_21_strides_0, weight = layers_10_mlp_up_proj_weight_palettized, x = input_315)[name = string("up_21")];
+            string gate_43_mode_0 = const()[name = string("gate_43_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_43 = gelu(mode = gate_43_mode_0, x = gate_41)[name = string("gate_43")];
+            tensor<fp16, [1, 10240, 1, 1]> input_317 = mul(x = gate_43, y = up_21)[name = string("input_317")];
+            string mlp_out_21_pad_type_0 = const()[name = string("mlp_out_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_21_strides_0 = const()[name = string("mlp_out_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_21_pad_0 = const()[name = string("mlp_out_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_21_dilations_0 = const()[name = string("mlp_out_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_21_groups_0 = const()[name = string("mlp_out_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_21 = conv(dilations = mlp_out_21_dilations_0, groups = mlp_out_21_groups_0, pad = mlp_out_21_pad_0, pad_type = mlp_out_21_pad_type_0, strides = mlp_out_21_strides_0, weight = layers_10_mlp_down_proj_weight_palettized, x = input_317)[name = string("mlp_out_21")];
+            tensor<int32, [1]> var_6737_axes_0 = const()[name = string("op_6737_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6737 = squeeze(axes = var_6737_axes_0, x = mlp_out_21)[name = string("op_6737")];
+            tensor<int32, [3]> var_6741 = const()[name = string("op_6741"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6747 = const()[name = string("op_6747"), val = int32(-1)];
+            fp16 const_127_promoted = const()[name = string("const_127_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_215 = transpose(perm = var_6741, x = var_6737)[name = string("transpose_21")];
+            tensor<fp16, [1, 1, 2560]> var_6749 = mul(x = x_215, y = const_127_promoted)[name = string("op_6749")];
+            bool input_319_interleave_0 = const()[name = string("input_319_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_319 = concat(axis = var_6747, interleave = input_319_interleave_0, values = (x_215, var_6749))[name = string("input_319")];
+            tensor<int32, [1]> normed_301_axes_0 = const()[name = string("normed_301_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6744_to_fp16 = const()[name = string("op_6744_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_301_cast_fp16 = layer_norm(axes = normed_301_axes_0, epsilon = var_6744_to_fp16, x = input_319)[name = string("normed_301_cast_fp16")];
+            tensor<int32, [2]> var_6754_split_sizes_0 = const()[name = string("op_6754_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6754_axis_0 = const()[name = string("op_6754_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6754_0, tensor<fp16, [1, 1, 2560]> var_6754_1 = split(axis = var_6754_axis_0, split_sizes = var_6754_split_sizes_0, x = normed_301_cast_fp16)[name = string("op_6754")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_103 = mul(x = var_6754_0, y = layers_10_post_feedforward_layernorm_weight)[name = string("hidden_states_103")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_105_cast_fp16 = add(x = x_213_cast_fp16, y = hidden_states_103)[name = string("hidden_states_105_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_21_begin_0 = const()[name = string("per_layer_slice_21_begin_0"), val = tensor<int32, [3]>([0, 0, 5632])];
+            tensor<int32, [3]> per_layer_slice_21_end_0 = const()[name = string("per_layer_slice_21_end_0"), val = tensor<int32, [3]>([1, 1, 5888])];
+            tensor<bool, [3]> per_layer_slice_21_end_mask_0 = const()[name = string("per_layer_slice_21_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_21_cast_fp16 = slice_by_index(begin = per_layer_slice_21_begin_0, end = per_layer_slice_21_end_0, end_mask = per_layer_slice_21_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_21_cast_fp16")];
+            tensor<int32, [3]> var_6782 = const()[name = string("op_6782"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_321_axes_0 = const()[name = string("input_321_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6783 = transpose(perm = var_6782, x = hidden_states_105_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 2560, 1, 1]> input_321 = expand_dims(axes = input_321_axes_0, x = var_6783)[name = string("input_321")];
+            string gated_61_pad_type_0 = const()[name = string("gated_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_61_strides_0 = const()[name = string("gated_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_61_pad_0 = const()[name = string("gated_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_61_dilations_0 = const()[name = string("gated_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_61_groups_0 = const()[name = string("gated_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_61 = conv(dilations = gated_61_dilations_0, groups = gated_61_groups_0, pad = gated_61_pad_0, pad_type = gated_61_pad_type_0, strides = gated_61_strides_0, weight = layers_10_per_layer_input_gate_weight_palettized, x = input_321)[name = string("gated_61")];
+            string gated_63_mode_0 = const()[name = string("gated_63_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_63 = gelu(mode = gated_63_mode_0, x = gated_61)[name = string("gated_63")];
+            tensor<int32, [3]> var_6802 = const()[name = string("op_6802"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_21_axes_0 = const()[name = string("per_layer_slice_conv_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_6803_cast_fp16 = transpose(perm = var_6802, x = per_layer_slice_21_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_21_cast_fp16 = expand_dims(axes = per_layer_slice_conv_21_axes_0, x = var_6803_cast_fp16)[name = string("per_layer_slice_conv_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_323_cast_fp16 = mul(x = gated_63, y = per_layer_slice_conv_21_cast_fp16)[name = string("input_323_cast_fp16")];
+            string gated_65_pad_type_0 = const()[name = string("gated_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_65_strides_0 = const()[name = string("gated_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_65_pad_0 = const()[name = string("gated_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_65_dilations_0 = const()[name = string("gated_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_65_groups_0 = const()[name = string("gated_65_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_10_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566264768))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566592512))))[name = string("layers_10_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_65_cast_fp16 = conv(dilations = gated_65_dilations_0, groups = gated_65_groups_0, pad = gated_65_pad_0, pad_type = gated_65_pad_type_0, strides = gated_65_strides_0, weight = layers_10_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_323_cast_fp16)[name = string("gated_65_cast_fp16")];
+            tensor<int32, [1]> var_6819_axes_0 = const()[name = string("op_6819_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6819_cast_fp16 = squeeze(axes = var_6819_axes_0, x = gated_65_cast_fp16)[name = string("op_6819_cast_fp16")];
+            tensor<int32, [3]> var_6823 = const()[name = string("op_6823"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6829 = const()[name = string("op_6829"), val = int32(-1)];
+            fp16 const_128_promoted_to_fp16 = const()[name = string("const_128_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_217_cast_fp16 = transpose(perm = var_6823, x = var_6819_cast_fp16)[name = string("transpose_18")];
+            tensor<fp16, [1, 1, 2560]> var_6831_cast_fp16 = mul(x = x_217_cast_fp16, y = const_128_promoted_to_fp16)[name = string("op_6831_cast_fp16")];
+            bool input_325_interleave_0 = const()[name = string("input_325_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_325_cast_fp16 = concat(axis = var_6829, interleave = input_325_interleave_0, values = (x_217_cast_fp16, var_6831_cast_fp16))[name = string("input_325_cast_fp16")];
+            tensor<int32, [1]> normed_305_axes_0 = const()[name = string("normed_305_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6826_to_fp16 = const()[name = string("op_6826_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_305_cast_fp16 = layer_norm(axes = normed_305_axes_0, epsilon = var_6826_to_fp16, x = input_325_cast_fp16)[name = string("normed_305_cast_fp16")];
+            tensor<int32, [2]> var_6836_split_sizes_0 = const()[name = string("op_6836_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6836_axis_0 = const()[name = string("op_6836_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6836_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6836_cast_fp16_1 = split(axis = var_6836_axis_0, split_sizes = var_6836_split_sizes_0, x = normed_305_cast_fp16)[name = string("op_6836_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566595136)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_109_cast_fp16 = mul(x = var_6836_cast_fp16_0, y = layers_10_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_111_cast_fp16 = add(x = hidden_states_105_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1]> const_129_promoted_to_fp16 = const()[name = string("const_129_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.42p-3])];
+            tensor<fp16, [1, 1, 2560]> x_219_cast_fp16 = mul(x = hidden_states_111_cast_fp16, y = const_129_promoted_to_fp16)[name = string("x_219_cast_fp16")];
+            tensor<int32, [1]> var_6848_axes_0 = const()[name = string("op_6848_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6848_cast_fp16 = squeeze(axes = var_6848_axes_0, x = K_sliding_out_cast_fp16)[name = string("op_6848_cast_fp16")];
+            tensor<int32, [1]> var_6850_axes_0 = const()[name = string("op_6850_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6850_cast_fp16 = squeeze(axes = var_6850_axes_0, x = V_sliding_out_cast_fp16)[name = string("op_6850_cast_fp16")];
+            tensor<int32, [4]> var_6853_begin_0 = const()[name = string("op_6853_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_6853_end_0 = const()[name = string("op_6853_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_6853_end_mask_0 = const()[name = string("op_6853_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6853_squeeze_mask_0 = const()[name = string("op_6853_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_6853_cast_fp16 = slice_by_index(begin = var_6853_begin_0, end = var_6853_end_0, end_mask = var_6853_end_mask_0, squeeze_mask = var_6853_squeeze_mask_0, x = K_full_in)[name = string("op_6853_cast_fp16")];
+            tensor<int32, [1]> K_full_slot_axes_0 = const()[name = string("K_full_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_slot_cast_fp16 = expand_dims(axes = K_full_slot_axes_0, x = var_6853_cast_fp16)[name = string("K_full_slot_cast_fp16")];
+            tensor<int32, [4]> var_6858_begin_0 = const()[name = string("op_6858_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_6858_end_0 = const()[name = string("op_6858_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_6858_end_mask_0 = const()[name = string("op_6858_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6858_squeeze_mask_0 = const()[name = string("op_6858_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_6858_cast_fp16 = slice_by_index(begin = var_6858_begin_0, end = var_6858_end_0, end_mask = var_6858_end_mask_0, squeeze_mask = var_6858_squeeze_mask_0, x = V_full_in)[name = string("op_6858_cast_fp16")];
+            tensor<int32, [1]> V_full_slot_axes_0 = const()[name = string("V_full_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_slot_cast_fp16 = expand_dims(axes = V_full_slot_axes_0, x = var_6858_cast_fp16)[name = string("V_full_slot_cast_fp16")];
+            int32 var_6865 = const()[name = string("op_6865"), val = int32(-1)];
+            fp16 const_130_promoted_to_fp16 = const()[name = string("const_130_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6867_cast_fp16 = mul(x = x_219_cast_fp16, y = const_130_promoted_to_fp16)[name = string("op_6867_cast_fp16")];
+            bool input_327_interleave_0 = const()[name = string("input_327_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_327_cast_fp16 = concat(axis = var_6865, interleave = input_327_interleave_0, values = (x_219_cast_fp16, var_6867_cast_fp16))[name = string("input_327_cast_fp16")];
+            tensor<int32, [1]> normed_309_axes_0 = const()[name = string("normed_309_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6862_to_fp16 = const()[name = string("op_6862_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_309_cast_fp16 = layer_norm(axes = normed_309_axes_0, epsilon = var_6862_to_fp16, x = input_327_cast_fp16)[name = string("normed_309_cast_fp16")];
+            tensor<int32, [2]> var_6872_split_sizes_0 = const()[name = string("op_6872_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6872_axis_0 = const()[name = string("op_6872_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6872_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6872_cast_fp16_1 = split(axis = var_6872_axis_0, split_sizes = var_6872_split_sizes_0, x = normed_309_cast_fp16)[name = string("op_6872_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566600320)))];
+            tensor<fp16, [1, 1, 2560]> h_67_cast_fp16 = mul(x = var_6872_cast_fp16_0, y = layers_11_input_layernorm_weight_promoted_to_fp16)[name = string("h_67_cast_fp16")];
+            tensor<int32, [3]> var_6878 = const()[name = string("op_6878"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6881_axes_0 = const()[name = string("op_6881_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6879_cast_fp16 = transpose(perm = var_6878, x = h_67_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 2560, 1, 1]> var_6881_cast_fp16 = expand_dims(axes = var_6881_axes_0, x = var_6879_cast_fp16)[name = string("op_6881_cast_fp16")];
+            string var_6897_pad_type_0 = const()[name = string("op_6897_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6897_strides_0 = const()[name = string("op_6897_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6897_pad_0 = const()[name = string("op_6897_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6897_dilations_0 = const()[name = string("op_6897_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6897_groups_0 = const()[name = string("op_6897_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_6897 = conv(dilations = var_6897_dilations_0, groups = var_6897_groups_0, pad = var_6897_pad_0, pad_type = var_6897_pad_type_0, strides = var_6897_strides_0, weight = layers_11_self_attn_q_proj_weight_palettized, x = var_6881_cast_fp16)[name = string("op_6897")];
+            tensor<int32, [4]> var_6902 = const()[name = string("op_6902"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_6903 = reshape(shape = var_6902, x = var_6897)[name = string("op_6903")];
+            tensor<int32, [4]> var_6908 = const()[name = string("op_6908"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6918 = const()[name = string("op_6918"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_6909 = transpose(perm = var_6908, x = var_6903)[name = string("transpose_16")];
+            tensor<fp16, [1, 8, 512]> x_221 = reshape(shape = var_6918, x = var_6909)[name = string("x_221")];
+            int32 var_6924 = const()[name = string("op_6924"), val = int32(-1)];
+            fp16 const_131_promoted = const()[name = string("const_131_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_6926 = mul(x = x_221, y = const_131_promoted)[name = string("op_6926")];
+            bool input_331_interleave_0 = const()[name = string("input_331_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_331 = concat(axis = var_6924, interleave = input_331_interleave_0, values = (x_221, var_6926))[name = string("input_331")];
+            tensor<int32, [1]> normed_313_axes_0 = const()[name = string("normed_313_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6921_to_fp16 = const()[name = string("op_6921_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_313_cast_fp16 = layer_norm(axes = normed_313_axes_0, epsilon = var_6921_to_fp16, x = input_331)[name = string("normed_313_cast_fp16")];
+            tensor<int32, [2]> var_6931_split_sizes_0 = const()[name = string("op_6931_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_6931_axis_0 = const()[name = string("op_6931_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_6931_0, tensor<fp16, [1, 8, 512]> var_6931_1 = split(axis = var_6931_axis_0, split_sizes = var_6931_split_sizes_0, x = normed_313_cast_fp16)[name = string("op_6931")];
+            tensor<fp16, [1, 8, 512]> var_6933 = mul(x = var_6931_0, y = layers_11_self_attn_q_norm_weight)[name = string("op_6933")];
+            tensor<int32, [4]> var_6938 = const()[name = string("op_6938"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_91 = reshape(shape = var_6938, x = var_6933)[name = string("q_91")];
+            tensor<fp16, [1, 8, 1, 512]> var_6940_cast_fp16 = mul(x = q_91, y = cos_f)[name = string("op_6940_cast_fp16")];
+            tensor<int32, [2]> var_6941_split_sizes_0 = const()[name = string("op_6941_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6941_axis_0 = const()[name = string("op_6941_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_6941_0, tensor<fp16, [1, 8, 1, 256]> var_6941_1 = split(axis = var_6941_axis_0, split_sizes = var_6941_split_sizes_0, x = q_91)[name = string("op_6941")];
+            fp16 const_132_promoted = const()[name = string("const_132_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_6943 = mul(x = var_6941_1, y = const_132_promoted)[name = string("op_6943")];
+            int32 var_6945 = const()[name = string("op_6945"), val = int32(-1)];
+            bool var_6946_interleave_0 = const()[name = string("op_6946_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_6946 = concat(axis = var_6945, interleave = var_6946_interleave_0, values = (var_6943, var_6941_0))[name = string("op_6946")];
+            tensor<fp16, [1, 8, 1, 512]> var_6947_cast_fp16 = mul(x = var_6946, y = sin_f)[name = string("op_6947_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_cast_fp16 = add(x = var_6940_cast_fp16, y = var_6947_cast_fp16)[name = string("q_cast_fp16")];
+            string var_6960_pad_type_0 = const()[name = string("op_6960_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6960_strides_0 = const()[name = string("op_6960_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6960_pad_0 = const()[name = string("op_6960_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6960_dilations_0 = const()[name = string("op_6960_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6960_groups_0 = const()[name = string("op_6960_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_6960 = conv(dilations = var_6960_dilations_0, groups = var_6960_groups_0, pad = var_6960_pad_0, pad_type = var_6960_pad_type_0, strides = var_6960_strides_0, weight = layers_11_self_attn_k_proj_weight_palettized, x = var_6881_cast_fp16)[name = string("op_6960")];
+            tensor<int32, [4]> var_6965 = const()[name = string("op_6965"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_6966 = reshape(shape = var_6965, x = var_6960)[name = string("op_6966")];
+            tensor<int32, [4]> var_6971 = const()[name = string("op_6971"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_6988_pad_type_0 = const()[name = string("op_6988_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6988_strides_0 = const()[name = string("op_6988_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6988_pad_0 = const()[name = string("op_6988_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6988_dilations_0 = const()[name = string("op_6988_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6988_groups_0 = const()[name = string("op_6988_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_6988 = conv(dilations = var_6988_dilations_0, groups = var_6988_groups_0, pad = var_6988_pad_0, pad_type = var_6988_pad_type_0, strides = var_6988_strides_0, weight = layers_11_self_attn_v_proj_weight_palettized, x = var_6881_cast_fp16)[name = string("op_6988")];
+            tensor<int32, [4]> var_6993 = const()[name = string("op_6993"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_6994 = reshape(shape = var_6993, x = var_6988)[name = string("op_6994")];
+            tensor<int32, [4]> var_6999 = const()[name = string("op_6999"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7009 = const()[name = string("op_7009"), val = tensor<int32, [3]>([1, 2, 512])];
+            tensor<fp16, [1, 2, 1, 512]> var_6972 = transpose(perm = var_6971, x = var_6966)[name = string("transpose_15")];
+            tensor<fp16, [1, 2, 512]> x_223 = reshape(shape = var_7009, x = var_6972)[name = string("x_223")];
+            int32 var_7015 = const()[name = string("op_7015"), val = int32(-1)];
+            fp16 const_133_promoted = const()[name = string("const_133_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 512]> var_7017 = mul(x = x_223, y = const_133_promoted)[name = string("op_7017")];
+            bool input_333_interleave_0 = const()[name = string("input_333_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1024]> input_333 = concat(axis = var_7015, interleave = input_333_interleave_0, values = (x_223, var_7017))[name = string("input_333")];
+            tensor<int32, [1]> normed_317_axes_0 = const()[name = string("normed_317_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7012_to_fp16 = const()[name = string("op_7012_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1024]> normed_317_cast_fp16 = layer_norm(axes = normed_317_axes_0, epsilon = var_7012_to_fp16, x = input_333)[name = string("normed_317_cast_fp16")];
+            tensor<int32, [2]> var_7022_split_sizes_0 = const()[name = string("op_7022_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7022_axis_0 = const()[name = string("op_7022_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 512]> var_7022_0, tensor<fp16, [1, 2, 512]> var_7022_1 = split(axis = var_7022_axis_0, split_sizes = var_7022_split_sizes_0, x = normed_317_cast_fp16)[name = string("op_7022")];
+            tensor<fp16, [1, 2, 512]> var_7024 = mul(x = var_7022_0, y = layers_11_self_attn_k_norm_weight)[name = string("op_7024")];
+            tensor<int32, [4]> var_7029 = const()[name = string("op_7029"), val = tensor<int32, [4]>([1, 2, 1, 512])];
+            tensor<fp16, [1, 2, 1, 512]> q_93 = reshape(shape = var_7029, x = var_7024)[name = string("q_93")];
+            fp16 var_7031_promoted = const()[name = string("op_7031_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 512]> var_7000 = transpose(perm = var_6999, x = var_6994)[name = string("transpose_14")];
+            tensor<fp16, [1, 2, 1, 512]> var_7032 = pow(x = var_7000, y = var_7031_promoted)[name = string("op_7032")];
+            tensor<int32, [1]> var_7037_axes_0 = const()[name = string("op_7037_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7037_keep_dims_0 = const()[name = string("op_7037_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_7037 = reduce_mean(axes = var_7037_axes_0, keep_dims = var_7037_keep_dims_0, x = var_7032)[name = string("op_7037")];
+            fp16 var_7039_to_fp16 = const()[name = string("op_7039_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_cast_fp16 = add(x = var_7037, y = var_7039_to_fp16)[name = string("mean_sq_cast_fp16")];
+            fp32 var_7041_epsilon_0 = const()[name = string("op_7041_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_7041_cast_fp16 = rsqrt(epsilon = var_7041_epsilon_0, x = mean_sq_cast_fp16)[name = string("op_7041_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> v_cast_fp16 = mul(x = var_7000, y = var_7041_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> var_7043_cast_fp16 = mul(x = q_93, y = cos_f)[name = string("op_7043_cast_fp16")];
+            tensor<int32, [2]> var_7044_split_sizes_0 = const()[name = string("op_7044_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7044_axis_0 = const()[name = string("op_7044_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 256]> var_7044_0, tensor<fp16, [1, 2, 1, 256]> var_7044_1 = split(axis = var_7044_axis_0, split_sizes = var_7044_split_sizes_0, x = q_93)[name = string("op_7044")];
+            fp16 const_134_promoted = const()[name = string("const_134_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 256]> var_7046 = mul(x = var_7044_1, y = const_134_promoted)[name = string("op_7046")];
+            int32 var_7048 = const()[name = string("op_7048"), val = int32(-1)];
+            bool var_7049_interleave_0 = const()[name = string("op_7049_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 512]> var_7049 = concat(axis = var_7048, interleave = var_7049_interleave_0, values = (var_7046, var_7044_0))[name = string("op_7049")];
+            tensor<fp16, [1, 2, 1, 512]> var_7050_cast_fp16 = mul(x = var_7049, y = sin_f)[name = string("op_7050_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> k_cast_fp16 = add(x = var_7043_cast_fp16, y = var_7050_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7056_cast_fp16 = mul(x = K_full_slot_cast_fp16, y = var_3733_cast_fp16)[name = string("op_7056_cast_fp16")];
+            tensor<int32, [4]> var_7057_reps_0 = const()[name = string("op_7057_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7057_cast_fp16 = tile(reps = var_7057_reps_0, x = k_cast_fp16)[name = string("op_7057_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7058_cast_fp16 = mul(x = var_7057_cast_fp16, y = update_mask)[name = string("op_7058_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> kv14_k = add(x = var_7056_cast_fp16, y = var_7058_cast_fp16)[name = string("K_full_out_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7064_cast_fp16 = mul(x = V_full_slot_cast_fp16, y = var_3733_cast_fp16)[name = string("op_7064_cast_fp16")];
+            tensor<int32, [4]> var_7065_reps_0 = const()[name = string("op_7065_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7065_cast_fp16 = tile(reps = var_7065_reps_0, x = v_cast_fp16)[name = string("op_7065_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7066_cast_fp16 = mul(x = var_7065_cast_fp16, y = update_mask)[name = string("op_7066_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> kv14_v = add(x = var_7064_cast_fp16, y = var_7066_cast_fp16)[name = string("V_full_out_cast_fp16")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_22_reps_0 = const()[name = string("tile_22_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_44_cast_fp16 = transpose(perm = transpose_44_perm_0, x = kv14_k)[name = string("transpose_13")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_22_cast_fp16 = tile(reps = tile_22_reps_0, x = transpose_44_cast_fp16)[name = string("tile_22_cast_fp16")];
+            tensor<int32, [5]> concat_44 = const()[name = string("concat_44"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_44_cast_fp16 = reshape(shape = concat_44, x = tile_22_cast_fp16)[name = string("reshape_44_cast_fp16")];
+            tensor<int32, [5]> transpose_45_perm_0 = const()[name = string("transpose_45_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_45 = const()[name = string("concat_45"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_45_cast_fp16 = transpose(perm = transpose_45_perm_0, x = reshape_44_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_45_cast_fp16 = reshape(shape = concat_45, x = transpose_45_cast_fp16)[name = string("reshape_45_cast_fp16")];
+            tensor<int32, [4]> transpose_59_perm_0 = const()[name = string("transpose_59_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_23_reps_0 = const()[name = string("tile_23_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_46_cast_fp16 = transpose(perm = transpose_46_perm_0, x = kv14_v)[name = string("transpose_11")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_23_cast_fp16 = tile(reps = tile_23_reps_0, x = transpose_46_cast_fp16)[name = string("tile_23_cast_fp16")];
+            tensor<int32, [5]> concat_46 = const()[name = string("concat_46"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_46_cast_fp16 = reshape(shape = concat_46, x = tile_23_cast_fp16)[name = string("reshape_46_cast_fp16")];
+            tensor<int32, [5]> transpose_47_perm_0 = const()[name = string("transpose_47_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_47 = const()[name = string("concat_47"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_47_cast_fp16 = transpose(perm = transpose_47_perm_0, x = reshape_46_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_47_cast_fp16 = reshape(shape = concat_47, x = transpose_47_cast_fp16)[name = string("reshape_47_cast_fp16")];
+            tensor<int32, [4]> V_expanded_perm_0 = const()[name = string("V_expanded_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_45_transpose_x_0 = const()[name = string("attn_weights_45_transpose_x_0"), val = bool(false)];
+            bool attn_weights_45_transpose_y_0 = const()[name = string("attn_weights_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_59_cast_fp16 = transpose(perm = transpose_59_perm_0, x = reshape_45_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_45_cast_fp16 = matmul(transpose_x = attn_weights_45_transpose_x_0, transpose_y = attn_weights_45_transpose_y_0, x = q_cast_fp16, y = transpose_59_cast_fp16)[name = string("attn_weights_45_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_227_cast_fp16 = add(x = attn_weights_45_cast_fp16, y = causal_mask_full)[name = string("x_227_cast_fp16")];
+            tensor<int32, [1]> reduce_max_11_axes_0 = const()[name = string("reduce_max_11_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_11_keep_dims_0 = const()[name = string("reduce_max_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_11 = reduce_max(axes = reduce_max_11_axes_0, keep_dims = reduce_max_11_keep_dims_0, x = x_227_cast_fp16)[name = string("reduce_max_11")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7118 = sub(x = x_227_cast_fp16, y = reduce_max_11)[name = string("op_7118")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7124 = exp(x = var_7118)[name = string("op_7124")];
+            tensor<int32, [1]> var_7134_axes_0 = const()[name = string("op_7134_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7134_keep_dims_0 = const()[name = string("op_7134_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_7134 = reduce_sum(axes = var_7134_axes_0, keep_dims = var_7134_keep_dims_0, x = var_7124)[name = string("op_7134")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7140_cast_fp16 = real_div(x = var_7124, y = var_7134)[name = string("op_7140_cast_fp16")];
+            bool attn_output_67_transpose_x_0 = const()[name = string("attn_output_67_transpose_x_0"), val = bool(false)];
+            bool attn_output_67_transpose_y_0 = const()[name = string("attn_output_67_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_cast_fp16 = transpose(perm = V_expanded_perm_0, x = reshape_47_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_67_cast_fp16 = matmul(transpose_x = attn_output_67_transpose_x_0, transpose_y = attn_output_67_transpose_y_0, x = var_7140_cast_fp16, y = V_expanded_cast_fp16)[name = string("attn_output_67_cast_fp16")];
+            tensor<int32, [4]> var_7151 = const()[name = string("op_7151"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7158 = const()[name = string("op_7158"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_7152_cast_fp16 = transpose(perm = var_7151, x = attn_output_67_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1, 4096]> attn_output_69_cast_fp16 = reshape(shape = var_7158, x = var_7152_cast_fp16)[name = string("attn_output_69_cast_fp16")];
+            tensor<int32, [3]> var_7163 = const()[name = string("op_7163"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7179_pad_type_0 = const()[name = string("op_7179_pad_type_0"), val = string("valid")];
+            int32 var_7179_groups_0 = const()[name = string("op_7179_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7179_strides_0 = const()[name = string("op_7179_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7179_pad_0 = const()[name = string("op_7179_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7179_dilations_0 = const()[name = string("op_7179_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_11_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566605504))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571848448))))[name = string("squeeze_11_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_7164_cast_fp16 = transpose(perm = var_7163, x = attn_output_69_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1]> var_7179_cast_fp16 = conv(dilations = var_7179_dilations_0, groups = var_7179_groups_0, pad = var_7179_pad_0, pad_type = var_7179_pad_type_0, strides = var_7179_strides_0, weight = squeeze_11_cast_fp16_to_fp32_to_fp16_palettized, x = var_7164_cast_fp16)[name = string("op_7179_cast_fp16")];
+            tensor<int32, [3]> var_7183 = const()[name = string("op_7183"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7189 = const()[name = string("op_7189"), val = int32(-1)];
+            fp16 const_135_promoted_to_fp16 = const()[name = string("const_135_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_231_cast_fp16 = transpose(perm = var_7183, x = var_7179_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 1, 2560]> var_7191_cast_fp16 = mul(x = x_231_cast_fp16, y = const_135_promoted_to_fp16)[name = string("op_7191_cast_fp16")];
+            bool input_337_interleave_0 = const()[name = string("input_337_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_337_cast_fp16 = concat(axis = var_7189, interleave = input_337_interleave_0, values = (x_231_cast_fp16, var_7191_cast_fp16))[name = string("input_337_cast_fp16")];
+            tensor<int32, [1]> normed_321_axes_0 = const()[name = string("normed_321_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7186_to_fp16 = const()[name = string("op_7186_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_321_cast_fp16 = layer_norm(axes = normed_321_axes_0, epsilon = var_7186_to_fp16, x = input_337_cast_fp16)[name = string("normed_321_cast_fp16")];
+            tensor<int32, [2]> var_7196_split_sizes_0 = const()[name = string("op_7196_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7196_axis_0 = const()[name = string("op_7196_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7196_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7196_cast_fp16_1 = split(axis = var_7196_axis_0, split_sizes = var_7196_split_sizes_0, x = normed_321_cast_fp16)[name = string("op_7196_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571851072)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_cast_fp16 = mul(x = var_7196_cast_fp16_0, y = layers_11_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_233_cast_fp16 = add(x = x_219_cast_fp16, y = attn_output_cast_fp16)[name = string("x_233_cast_fp16")];
+            int32 var_7205 = const()[name = string("op_7205"), val = int32(-1)];
+            fp16 const_136_promoted_to_fp16 = const()[name = string("const_136_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_7207_cast_fp16 = mul(x = x_233_cast_fp16, y = const_136_promoted_to_fp16)[name = string("op_7207_cast_fp16")];
+            bool input_339_interleave_0 = const()[name = string("input_339_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_339_cast_fp16 = concat(axis = var_7205, interleave = input_339_interleave_0, values = (x_233_cast_fp16, var_7207_cast_fp16))[name = string("input_339_cast_fp16")];
+            tensor<int32, [1]> normed_325_axes_0 = const()[name = string("normed_325_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7202_to_fp16 = const()[name = string("op_7202_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_325_cast_fp16 = layer_norm(axes = normed_325_axes_0, epsilon = var_7202_to_fp16, x = input_339_cast_fp16)[name = string("normed_325_cast_fp16")];
+            tensor<int32, [2]> var_7212_split_sizes_0 = const()[name = string("op_7212_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7212_axis_0 = const()[name = string("op_7212_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7212_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7212_cast_fp16_1 = split(axis = var_7212_axis_0, split_sizes = var_7212_split_sizes_0, x = normed_325_cast_fp16)[name = string("op_7212_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571856256)))];
+            tensor<fp16, [1, 1, 2560]> h_69_cast_fp16 = mul(x = var_7212_cast_fp16_0, y = layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_69_cast_fp16")];
+            tensor<int32, [3]> var_7223 = const()[name = string("op_7223"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_341_axes_0 = const()[name = string("input_341_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7224 = transpose(perm = var_7223, x = h_69_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 1]> input_341 = expand_dims(axes = input_341_axes_0, x = var_7224)[name = string("input_341")];
+            string gate_45_pad_type_0 = const()[name = string("gate_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_45_strides_0 = const()[name = string("gate_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_45_pad_0 = const()[name = string("gate_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_45_dilations_0 = const()[name = string("gate_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_45_groups_0 = const()[name = string("gate_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_45 = conv(dilations = gate_45_dilations_0, groups = gate_45_groups_0, pad = gate_45_pad_0, pad_type = gate_45_pad_type_0, strides = gate_45_strides_0, weight = layers_11_mlp_gate_proj_weight_palettized, x = input_341)[name = string("gate_45")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_11_mlp_up_proj_weight_palettized, x = input_341)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate = gelu(mode = gate_mode_0, x = gate_45)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 1]> input_343 = mul(x = gate, y = up)[name = string("input_343")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_11_mlp_down_proj_weight_palettized, x = input_343)[name = string("mlp_out")];
+            tensor<int32, [1]> var_7264_axes_0 = const()[name = string("op_7264_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7264 = squeeze(axes = var_7264_axes_0, x = mlp_out)[name = string("op_7264")];
+            tensor<int32, [3]> var_7268 = const()[name = string("op_7268"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7274 = const()[name = string("op_7274"), val = int32(-1)];
+            fp16 const_137_promoted = const()[name = string("const_137_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_235 = transpose(perm = var_7268, x = var_7264)[name = string("transpose_3")];
+            tensor<fp16, [1, 1, 2560]> var_7276 = mul(x = x_235, y = const_137_promoted)[name = string("op_7276")];
+            bool input_345_interleave_0 = const()[name = string("input_345_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_345 = concat(axis = var_7274, interleave = input_345_interleave_0, values = (x_235, var_7276))[name = string("input_345")];
+            tensor<int32, [1]> normed_329_axes_0 = const()[name = string("normed_329_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7271_to_fp16 = const()[name = string("op_7271_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_329_cast_fp16 = layer_norm(axes = normed_329_axes_0, epsilon = var_7271_to_fp16, x = input_345)[name = string("normed_329_cast_fp16")];
+            tensor<int32, [2]> var_7281_split_sizes_0 = const()[name = string("op_7281_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7281_axis_0 = const()[name = string("op_7281_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7281_0, tensor<fp16, [1, 1, 2560]> var_7281_1 = split(axis = var_7281_axis_0, split_sizes = var_7281_split_sizes_0, x = normed_329_cast_fp16)[name = string("op_7281")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_113 = mul(x = var_7281_0, y = layers_11_post_feedforward_layernorm_weight)[name = string("hidden_states_113")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_115_cast_fp16 = add(x = x_233_cast_fp16, y = hidden_states_113)[name = string("hidden_states_115_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 5888])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 1, 6144])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_7309 = const()[name = string("op_7309"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_347_axes_0 = const()[name = string("input_347_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7310 = transpose(perm = var_7309, x = hidden_states_115_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 1]> input_347 = expand_dims(axes = input_347_axes_0, x = var_7310)[name = string("input_347")];
+            string gated_67_pad_type_0 = const()[name = string("gated_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_67_strides_0 = const()[name = string("gated_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_67_pad_0 = const()[name = string("gated_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_67_dilations_0 = const()[name = string("gated_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_67_groups_0 = const()[name = string("gated_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_67 = conv(dilations = gated_67_dilations_0, groups = gated_67_groups_0, pad = gated_67_pad_0, pad_type = gated_67_pad_type_0, strides = gated_67_strides_0, weight = layers_11_per_layer_input_gate_weight_palettized, x = input_347)[name = string("gated_67")];
+            string gated_69_mode_0 = const()[name = string("gated_69_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_69 = gelu(mode = gated_69_mode_0, x = gated_67)[name = string("gated_69")];
+            tensor<int32, [3]> var_7329 = const()[name = string("op_7329"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_7330_cast_fp16 = transpose(perm = var_7329, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_7330_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_349_cast_fp16 = mul(x = gated_69, y = per_layer_slice_conv_cast_fp16)[name = string("input_349_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_11_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571861440))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(572189184))))[name = string("layers_11_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_11_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_349_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_7346_axes_0 = const()[name = string("op_7346_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7346_cast_fp16 = squeeze(axes = var_7346_axes_0, x = gated_cast_fp16)[name = string("op_7346_cast_fp16")];
+            tensor<int32, [3]> var_7350 = const()[name = string("op_7350"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7356 = const()[name = string("op_7356"), val = int32(-1)];
+            fp16 const_138_promoted_to_fp16 = const()[name = string("const_138_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_cast_fp16 = transpose(perm = var_7350, x = var_7346_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 1, 2560]> var_7358_cast_fp16 = mul(x = x_cast_fp16, y = const_138_promoted_to_fp16)[name = string("op_7358_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_cast_fp16 = concat(axis = var_7356, interleave = input_interleave_0, values = (x_cast_fp16, var_7358_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_333_axes_0 = const()[name = string("normed_333_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7353_to_fp16 = const()[name = string("op_7353_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_333_cast_fp16 = layer_norm(axes = normed_333_axes_0, epsilon = var_7353_to_fp16, x = input_cast_fp16)[name = string("normed_333_cast_fp16")];
+            tensor<int32, [2]> var_7363_split_sizes_0 = const()[name = string("op_7363_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7363_axis_0 = const()[name = string("op_7363_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7363_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7363_cast_fp16_1 = split(axis = var_7363_axis_0, split_sizes = var_7363_split_sizes_0, x = normed_333_cast_fp16)[name = string("op_7363_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(572191808)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_119_cast_fp16 = mul(x = var_7363_cast_fp16_0, y = layers_11_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_115_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_139_promoted_to_fp16 = const()[name = string("const_139_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.0cp-4])];
+            tensor<fp16, [1, 1, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_139_promoted_to_fp16)[name = string("op_7373_cast_fp16")];
+            tensor<int32, [1]> var_7375_axes_0 = const()[name = string("op_7375_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_7375_cast_fp16 = squeeze(axes = var_7375_axes_0, x = kv14_k)[name = string("op_7375_cast_fp16")];
+            tensor<int32, [1]> var_7377_axes_0 = const()[name = string("op_7377_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_7377_cast_fp16 = squeeze(axes = var_7377_axes_0, x = kv14_v)[name = string("op_7377_cast_fp16")];
+            int32 var_7380_axis_0 = const()[name = string("op_7380_axis_0"), val = int32(0)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out = stack(axis = var_7380_axis_0, values = (var_1290_cast_fp16, var_1849_cast_fp16, var_2408_cast_fp16, var_2967_cast_fp16, var_3526_cast_fp16, var_4602_cast_fp16, var_5161_cast_fp16, var_5720_cast_fp16, var_6279_cast_fp16, var_6848_cast_fp16))[name = string("op_7380_cast_fp16")];
+            int32 var_7383_axis_0 = const()[name = string("op_7383_axis_0"), val = int32(0)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out = stack(axis = var_7383_axis_0, values = (var_1292_cast_fp16, var_1851_cast_fp16, var_2410_cast_fp16, var_2969_cast_fp16, var_3528_cast_fp16, var_4604_cast_fp16, var_5163_cast_fp16, var_5722_cast_fp16, var_6281_cast_fp16, var_6850_cast_fp16))[name = string("op_7383_cast_fp16")];
+            int32 var_7386_axis_0 = const()[name = string("op_7386_axis_0"), val = int32(0)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out = stack(axis = var_7386_axis_0, values = (var_4043_cast_fp16, var_7375_cast_fp16))[name = string("op_7386_cast_fp16")];
+            int32 var_7389_axis_0 = const()[name = string("op_7389_axis_0"), val = int32(0)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out = stack(axis = var_7389_axis_0, values = (var_4045_cast_fp16, var_7377_cast_fp16))[name = string("op_7389_cast_fp16")];
+        } -> (hidden_states_out, K_sliding_out, V_sliding_out, K_full_out, V_full_out, kv13_k, kv13_v, kv14_k, kv14_v);
+    func verify_qK<ios18>(tensor<fp16, [2, 2, 2048, 512]> K_full_in, tensor<fp16, [10, 2, 512, 512]> K_sliding_in, tensor<fp16, [2, 2, 2048, 512]> V_full_in, tensor<fp16, [10, 2, 512, 512]> V_sliding_in, tensor<fp16, [1, 1, 3, 2048]> causal_mask_full, tensor<fp16, [1, 1, 3, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 3, 512]> cos_f, tensor<fp16, [1, 1, 3, 256]> cos_s, tensor<fp16, [1, 3, 2560]> hidden_states, tensor<fp16, [1, 3, 10752]> per_layer_combined, tensor<fp16, [1, 1, 3, 512]> sin_f, tensor<fp16, [1, 1, 3, 256]> sin_s, tensor<fp16, [1, 1, 2048, 3]> update_indicator) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3279680))))[name = string("layers_0_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_0_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3280256))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3935680))))[name = string("layers_0_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_k_norm_weight = const()[name = string("layers_0_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3936256)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3936832))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17044096))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17054400))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30161664))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30171968))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43279232))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43281856)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43287040))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43614784))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43615104))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46236608))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_q_norm_weight = const()[name = string("layers_1_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46238720)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46239296))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46894720))))[name = string("layers_1_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_1_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46895296))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47550720))))[name = string("layers_1_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_1_self_attn_k_norm_weight = const()[name = string("layers_1_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47551296)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47551872))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60659136))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60669440))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73776704))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73787008))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86894272))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86896896)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86902080))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87229824))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87230144))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89851648))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89853760)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89854336))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90509760))))[name = string("layers_2_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_2_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90510336))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91165760))))[name = string("layers_2_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_2_self_attn_k_norm_weight = const()[name = string("layers_2_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91166336)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91166912))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104274176))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104284480))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117391744))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117402048))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130509312))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130511936)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130517120))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130844864))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130845184))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133466688))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_q_norm_weight = const()[name = string("layers_3_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133468800)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133469376))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134124800))))[name = string("layers_3_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_3_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134125376))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134780800))))[name = string("layers_3_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_3_self_attn_k_norm_weight = const()[name = string("layers_3_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134781376)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134781952))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147889216))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147899520))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161006784))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161017088))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174124352))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174126976)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174132160))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174459904))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174460224))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177081728))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_q_norm_weight = const()[name = string("layers_4_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177083840)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177084416))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177739840))))[name = string("layers_4_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_4_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177740416))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178395840))))[name = string("layers_4_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_4_self_attn_k_norm_weight = const()[name = string("layers_4_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178396416)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178396992))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191504256))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191514560))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204621824))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204632128))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217739392))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217742016)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217747200))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218074944))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218075264))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223318208))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_q_norm_weight = const()[name = string("layers_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223322368)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223323456))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224634240))))[name = string("layers_5_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_5_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224635328))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225946112))))[name = string("layers_5_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_k_norm_weight = const()[name = string("layers_5_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225947200)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225948288))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239055552))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239065856))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252173120))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252183424))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265290688))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265293312)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265298496))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265626240))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265626560))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268248064))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268250176))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268905600))))[name = string("layers_6_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_6_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268906176))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269561600))))[name = string("layers_6_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_6_self_attn_k_norm_weight = const()[name = string("layers_6_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269562176)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269562752))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282670016))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282680320))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295787584))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295797888))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308905152))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308907776)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308912960))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309240704))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309241024))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311862528))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_7_self_attn_q_norm_weight = const()[name = string("layers_7_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311864640)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311865216))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(312520640))))[name = string("layers_7_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_7_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(312521216))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313176640))))[name = string("layers_7_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_7_self_attn_k_norm_weight = const()[name = string("layers_7_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313177216)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313177792))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326285056))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326295360))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339402624))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339412928))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352520192))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352522816)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352528000))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352855744))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352856064))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355477568))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355479680))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356135104))))[name = string("layers_8_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_8_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356135680))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356791104))))[name = string("layers_8_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_8_self_attn_k_norm_weight = const()[name = string("layers_8_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356791680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356792256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369899520))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369909824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383017088))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383027392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396134656))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396137280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396142464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396470208))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_9_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396470528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399092032))))[name = string("layers_9_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_q_norm_weight = const()[name = string("layers_9_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399094144)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399094720))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399750144))))[name = string("layers_9_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_9_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399750720))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400406144))))[name = string("layers_9_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_9_self_attn_k_norm_weight = const()[name = string("layers_9_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400406720)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400407296))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413514560))))[name = string("layers_9_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_9_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413524864))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(426632128))))[name = string("layers_9_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_9_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(426642432))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439749696))))[name = string("layers_9_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_9_post_feedforward_layernorm_weight = const()[name = string("layers_9_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439752320)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_9_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439757504))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440085248))))[name = string("layers_9_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_10_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440085568))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442707072))))[name = string("layers_10_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_10_self_attn_q_norm_weight = const()[name = string("layers_10_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442709184)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442709760))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443365184))))[name = string("layers_10_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_10_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443365760))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(444021184))))[name = string("layers_10_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(444021760))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457129024))))[name = string("layers_10_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_10_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457139328))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470246592))))[name = string("layers_10_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_10_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470256896))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483364160))))[name = string("layers_10_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_10_post_feedforward_layernorm_weight = const()[name = string("layers_10_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483366784)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_10_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483371968))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483699712))))[name = string("layers_10_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_11_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483700032))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488942976))))[name = string("layers_11_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_q_norm_weight = const()[name = string("layers_11_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488947136)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488948224))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490259008))))[name = string("layers_11_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_11_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490260096))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491570880))))[name = string("layers_11_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_11_self_attn_k_norm_weight = const()[name = string("layers_11_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491571968)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491573056))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504680320))))[name = string("layers_11_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_11_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504690624))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(517797888))))[name = string("layers_11_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_11_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(517808192))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530915456))))[name = string("layers_11_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_11_post_feedforward_layernorm_weight = const()[name = string("layers_11_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530918080)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_11_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530923264))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531251008))))[name = string("layers_11_per_layer_input_gate_weight_palettized")];
+            int32 var_738 = const()[name = string("op_738"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_740_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_740_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_1_cast_fp16 = concat(axis = var_738, interleave = input_1_interleave_0, values = (hidden_states, var_740_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_735_to_fp16 = const()[name = string("op_735_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_735_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_745_split_sizes_0 = const()[name = string("op_745_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_745_axis_0 = const()[name = string("op_745_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_745_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_745_cast_fp16_1 = split(axis = var_745_axis_0, split_sizes = var_745_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_745_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531251328)))];
+            tensor<fp16, [1, 3, 2560]> h_1_cast_fp16 = mul(x = var_745_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_751 = const()[name = string("op_751"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_754_axes_0 = const()[name = string("op_754_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_752_cast_fp16 = transpose(perm = var_751, x = h_1_cast_fp16)[name = string("transpose_239")];
+            tensor<fp16, [1, 2560, 1, 3]> var_754_cast_fp16 = expand_dims(axes = var_754_axes_0, x = var_752_cast_fp16)[name = string("op_754_cast_fp16")];
+            string q_1_pad_type_0 = const()[name = string("q_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_1_strides_0 = const()[name = string("q_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = string("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_1_dilations_0 = const()[name = string("q_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_1_groups_0 = const()[name = string("q_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_1 = conv(dilations = q_1_dilations_0, groups = q_1_groups_0, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = q_1_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_754_cast_fp16)[name = string("q_1")];
+            tensor<int32, [4]> var_775 = const()[name = string("op_775"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_776 = reshape(shape = var_775, x = q_1)[name = string("op_776")];
+            tensor<int32, [4]> transpose_48_perm_0 = const()[name = string("transpose_48_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_799 = const()[name = string("op_799"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_48 = transpose(perm = transpose_48_perm_0, x = var_776)[name = string("transpose_238")];
+            tensor<fp16, [3, 8, 256]> x_1 = reshape(shape = var_799, x = transpose_48)[name = string("x_1")];
+            int32 var_805 = const()[name = string("op_805"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_807 = mul(x = x_1, y = const_1_promoted)[name = string("op_807")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_5 = concat(axis = var_805, interleave = input_5_interleave_0, values = (x_1, var_807))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_802_to_fp16 = const()[name = string("op_802_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_802_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_812_split_sizes_0 = const()[name = string("op_812_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_812_axis_0 = const()[name = string("op_812_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_812_0, tensor<fp16, [3, 8, 256]> var_812_1 = split(axis = var_812_axis_0, split_sizes = var_812_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_812")];
+            tensor<fp16, [3, 8, 256]> q_5 = mul(x = var_812_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_5")];
+            tensor<int32, [4]> var_819 = const()[name = string("op_819"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_820 = reshape(shape = var_819, x = q_5)[name = string("op_820")];
+            tensor<int32, [4]> var_825 = const()[name = string("op_825"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_7 = transpose(perm = var_825, x = var_820)[name = string("transpose_237")];
+            tensor<fp16, [1, 8, 3, 256]> var_827_cast_fp16 = mul(x = q_7, y = cos_s)[name = string("op_827_cast_fp16")];
+            tensor<int32, [2]> var_828_split_sizes_0 = const()[name = string("op_828_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_828_axis_0 = const()[name = string("op_828_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_828_0, tensor<fp16, [1, 8, 3, 128]> var_828_1 = split(axis = var_828_axis_0, split_sizes = var_828_split_sizes_0, x = q_7)[name = string("op_828")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_830 = mul(x = var_828_1, y = const_2_promoted)[name = string("op_830")];
+            int32 var_832 = const()[name = string("op_832"), val = int32(-1)];
+            bool var_833_interleave_0 = const()[name = string("op_833_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_833 = concat(axis = var_832, interleave = var_833_interleave_0, values = (var_830, var_828_0))[name = string("op_833")];
+            tensor<fp16, [1, 8, 3, 256]> var_834_cast_fp16 = mul(x = var_833, y = sin_s)[name = string("op_834_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_11_cast_fp16 = add(x = var_827_cast_fp16, y = var_834_cast_fp16)[name = string("q_11_cast_fp16")];
+            string k_1_pad_type_0 = const()[name = string("k_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_1_strides_0 = const()[name = string("k_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_1_pad_0 = const()[name = string("k_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_1_dilations_0 = const()[name = string("k_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_1_groups_0 = const()[name = string("k_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_1 = conv(dilations = k_1_dilations_0, groups = k_1_groups_0, pad = k_1_pad_0, pad_type = k_1_pad_type_0, strides = k_1_strides_0, weight = layers_0_self_attn_k_proj_weight_palettized, x = var_754_cast_fp16)[name = string("k_1")];
+            tensor<int32, [4]> var_852 = const()[name = string("op_852"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_853 = reshape(shape = var_852, x = k_1)[name = string("op_853")];
+            tensor<int32, [4]> transpose_49_perm_0 = const()[name = string("transpose_49_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_1_pad_type_0 = const()[name = string("v_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_1_strides_0 = const()[name = string("v_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_1_pad_0 = const()[name = string("v_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_1_dilations_0 = const()[name = string("v_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_1_groups_0 = const()[name = string("v_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_1 = conv(dilations = v_1_dilations_0, groups = v_1_groups_0, pad = v_1_pad_0, pad_type = v_1_pad_type_0, strides = v_1_strides_0, weight = layers_0_self_attn_v_proj_weight_palettized, x = var_754_cast_fp16)[name = string("v_1")];
+            tensor<int32, [4]> var_880 = const()[name = string("op_880"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_881 = reshape(shape = var_880, x = v_1)[name = string("op_881")];
+            tensor<int32, [4]> var_886 = const()[name = string("op_886"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_904 = const()[name = string("op_904"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_49 = transpose(perm = transpose_49_perm_0, x = var_853)[name = string("transpose_236")];
+            tensor<fp16, [3, 2, 256]> x_3 = reshape(shape = var_904, x = transpose_49)[name = string("x_3")];
+            int32 var_910 = const()[name = string("op_910"), val = int32(-1)];
+            fp16 const_3_promoted = const()[name = string("const_3_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_912 = mul(x = x_3, y = const_3_promoted)[name = string("op_912")];
+            bool input_7_interleave_0 = const()[name = string("input_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_7 = concat(axis = var_910, interleave = input_7_interleave_0, values = (x_3, var_912))[name = string("input_7")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_907_to_fp16 = const()[name = string("op_907_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_907_to_fp16, x = input_7)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_917_split_sizes_0 = const()[name = string("op_917_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_917_axis_0 = const()[name = string("op_917_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_917_0, tensor<fp16, [3, 2, 256]> var_917_1 = split(axis = var_917_axis_0, split_sizes = var_917_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_917")];
+            tensor<fp16, [3, 2, 256]> k_5 = mul(x = var_917_0, y = layers_0_self_attn_k_norm_weight)[name = string("k_5")];
+            tensor<int32, [4]> var_924 = const()[name = string("op_924"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_925 = reshape(shape = var_924, x = k_5)[name = string("op_925")];
+            tensor<int32, [4]> var_930 = const()[name = string("op_930"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_932_promoted = const()[name = string("op_932_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_887 = transpose(perm = var_886, x = var_881)[name = string("transpose_235")];
+            tensor<fp16, [1, 2, 3, 256]> var_933 = pow(x = var_887, y = var_932_promoted)[name = string("op_933")];
+            tensor<int32, [1]> var_938_axes_0 = const()[name = string("op_938_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_938_keep_dims_0 = const()[name = string("op_938_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_938 = reduce_mean(axes = var_938_axes_0, keep_dims = var_938_keep_dims_0, x = var_933)[name = string("op_938")];
+            fp16 var_940_to_fp16 = const()[name = string("op_940_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_1_cast_fp16 = add(x = var_938, y = var_940_to_fp16)[name = string("mean_sq_1_cast_fp16")];
+            fp32 var_942_epsilon_0 = const()[name = string("op_942_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_942_cast_fp16 = rsqrt(epsilon = var_942_epsilon_0, x = mean_sq_1_cast_fp16)[name = string("op_942_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_11_cast_fp16 = mul(x = var_887, y = var_942_cast_fp16)[name = string("input_11_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_9 = transpose(perm = var_930, x = var_925)[name = string("transpose_234")];
+            tensor<fp16, [1, 2, 3, 256]> var_944_cast_fp16 = mul(x = q_9, y = cos_s)[name = string("op_944_cast_fp16")];
+            tensor<int32, [2]> var_945_split_sizes_0 = const()[name = string("op_945_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_945_axis_0 = const()[name = string("op_945_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_945_0, tensor<fp16, [1, 2, 3, 128]> var_945_1 = split(axis = var_945_axis_0, split_sizes = var_945_split_sizes_0, x = q_9)[name = string("op_945")];
+            fp16 const_4_promoted = const()[name = string("const_4_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_947 = mul(x = var_945_1, y = const_4_promoted)[name = string("op_947")];
+            int32 var_949 = const()[name = string("op_949"), val = int32(-1)];
+            bool var_950_interleave_0 = const()[name = string("op_950_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_950 = concat(axis = var_949, interleave = var_950_interleave_0, values = (var_947, var_945_0))[name = string("op_950")];
+            tensor<fp16, [1, 2, 3, 256]> var_951_cast_fp16 = mul(x = var_950, y = sin_s)[name = string("op_951_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_9_cast_fp16 = add(x = var_944_cast_fp16, y = var_951_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [8]> k_padded_1_pad_0 = const()[name = string("k_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_1_mode_0 = const()[name = string("k_padded_1_mode_0"), val = string("constant")];
+            fp16 const_5_to_fp16 = const()[name = string("const_5_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_1_cast_fp16 = pad(constant_val = const_5_to_fp16, mode = k_padded_1_mode_0, pad = k_padded_1_pad_0, x = input_9_cast_fp16)[name = string("k_padded_1_cast_fp16")];
+            tensor<int32, [8]> v_padded_1_pad_0 = const()[name = string("v_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_1_mode_0 = const()[name = string("v_padded_1_mode_0"), val = string("constant")];
+            fp16 const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_1_cast_fp16 = pad(constant_val = const_6_to_fp16, mode = v_padded_1_mode_0, pad = v_padded_1_pad_0, x = input_11_cast_fp16)[name = string("v_padded_1_cast_fp16")];
+            tensor<int32, [4]> slot_k_1_begin_0 = const()[name = string("slot_k_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_1_end_0 = const()[name = string("slot_k_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_1_end_mask_0 = const()[name = string("slot_k_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_1_cast_fp16 = slice_by_index(begin = slot_k_1_begin_0, end = slot_k_1_end_0, end_mask = slot_k_1_end_mask_0, x = K_sliding_in)[name = string("slot_k_1_cast_fp16")];
+            tensor<int32, [4]> slot_v_1_begin_0 = const()[name = string("slot_v_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_1_end_0 = const()[name = string("slot_v_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_1_end_mask_0 = const()[name = string("slot_v_1_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_1_cast_fp16 = slice_by_index(begin = slot_v_1_begin_0, end = slot_v_1_end_0, end_mask = slot_v_1_end_mask_0, x = V_sliding_in)[name = string("slot_v_1_cast_fp16")];
+            tensor<int32, [4]> var_990_begin_0 = const()[name = string("op_990_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_990_end_0 = const()[name = string("op_990_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_990_end_mask_0 = const()[name = string("op_990_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_990_cast_fp16 = slice_by_index(begin = var_990_begin_0, end = var_990_end_0, end_mask = var_990_end_mask_0, x = slot_k_1_cast_fp16)[name = string("op_990_cast_fp16")];
+            int32 var_997 = const()[name = string("op_997"), val = int32(2)];
+            bool new_k_1_interleave_0 = const()[name = string("new_k_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_1_cast_fp16 = concat(axis = var_997, interleave = new_k_1_interleave_0, values = (var_990_cast_fp16, k_padded_1_cast_fp16))[name = string("new_k_1_cast_fp16")];
+            tensor<int32, [4]> var_1013_begin_0 = const()[name = string("op_1013_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1013_end_0 = const()[name = string("op_1013_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1013_end_mask_0 = const()[name = string("op_1013_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1013_cast_fp16 = slice_by_index(begin = var_1013_begin_0, end = var_1013_end_0, end_mask = var_1013_end_mask_0, x = slot_v_1_cast_fp16)[name = string("op_1013_cast_fp16")];
+            int32 var_1020 = const()[name = string("op_1020"), val = int32(2)];
+            bool new_v_1_interleave_0 = const()[name = string("new_v_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_1_cast_fp16 = concat(axis = var_1020, interleave = new_v_1_interleave_0, values = (var_1013_cast_fp16, v_padded_1_cast_fp16))[name = string("new_v_1_cast_fp16")];
+            tensor<int32, [4]> var_1031_begin_0 = const()[name = string("op_1031_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1031_end_0 = const()[name = string("op_1031_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1031_end_mask_0 = const()[name = string("op_1031_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_1031_cast_fp16 = slice_by_index(begin = var_1031_begin_0, end = var_1031_end_0, end_mask = var_1031_end_mask_0, x = K_sliding_in)[name = string("op_1031_cast_fp16")];
+            int32 var_1033 = const()[name = string("op_1033"), val = int32(0)];
+            bool K_sliding_out_1_interleave_0 = const()[name = string("K_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_1_cast_fp16 = concat(axis = var_1033, interleave = K_sliding_out_1_interleave_0, values = (new_k_1_cast_fp16, var_1031_cast_fp16))[name = string("K_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_1044_begin_0 = const()[name = string("op_1044_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1044_end_0 = const()[name = string("op_1044_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1044_end_mask_0 = const()[name = string("op_1044_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_1044_cast_fp16 = slice_by_index(begin = var_1044_begin_0, end = var_1044_end_0, end_mask = var_1044_end_mask_0, x = V_sliding_in)[name = string("op_1044_cast_fp16")];
+            int32 var_1046 = const()[name = string("op_1046"), val = int32(0)];
+            bool V_sliding_out_1_interleave_0 = const()[name = string("V_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_1_cast_fp16 = concat(axis = var_1046, interleave = V_sliding_out_1_interleave_0, values = (new_v_1_cast_fp16, var_1044_cast_fp16))[name = string("V_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_1052_begin_0 = const()[name = string("op_1052_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1052_end_0 = const()[name = string("op_1052_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1052_end_mask_0 = const()[name = string("op_1052_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1052_cast_fp16 = slice_by_index(begin = var_1052_begin_0, end = var_1052_end_0, end_mask = var_1052_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1052_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_1_begin_0 = const()[name = string("K_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_1_end_0 = const()[name = string("K_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_1_end_mask_0 = const()[name = string("K_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_1_cast_fp16 = slice_by_index(begin = K_for_attn_1_begin_0, end = K_for_attn_1_end_0, end_mask = K_for_attn_1_end_mask_0, x = var_1052_cast_fp16)[name = string("K_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> var_1062_begin_0 = const()[name = string("op_1062_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1062_end_0 = const()[name = string("op_1062_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1062_end_mask_0 = const()[name = string("op_1062_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1062_cast_fp16 = slice_by_index(begin = var_1062_begin_0, end = var_1062_end_0, end_mask = var_1062_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1062_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_1_begin_0 = const()[name = string("V_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_1_end_0 = const()[name = string("V_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_1_end_mask_0 = const()[name = string("V_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_1_cast_fp16 = slice_by_index(begin = V_for_attn_1_begin_0, end = V_for_attn_1_end_0, end_mask = V_for_attn_1_end_mask_0, x = var_1062_cast_fp16)[name = string("V_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = K_for_attn_1_cast_fp16)[name = string("transpose_233")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_232")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_50_perm_0 = const()[name = string("transpose_50_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = V_for_attn_1_cast_fp16)[name = string("transpose_231")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_230")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_50_cast_fp16 = transpose(perm = transpose_50_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_229")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_11_cast_fp16, y = transpose_50_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_7_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_7_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_7_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 3, 512]> var_1097 = sub(x = x_7_cast_fp16, y = reduce_max_0)[name = string("op_1097")];
+            tensor<fp16, [1, 8, 3, 512]> var_1103 = exp(x = var_1097)[name = string("op_1103")];
+            tensor<int32, [1]> var_1113_axes_0 = const()[name = string("op_1113_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1113_keep_dims_0 = const()[name = string("op_1113_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1113 = reduce_sum(axes = var_1113_axes_0, keep_dims = var_1113_keep_dims_0, x = var_1103)[name = string("op_1113")];
+            tensor<fp16, [1, 8, 3, 512]> var_1119_cast_fp16 = real_div(x = var_1103, y = var_1113)[name = string("op_1119_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_228")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_1119_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_1130 = const()[name = string("op_1130"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1137 = const()[name = string("op_1137"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1131_cast_fp16 = transpose(perm = var_1130, x = attn_output_1_cast_fp16)[name = string("transpose_227")];
+            tensor<fp16, [1, 3, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_1137, x = var_1131_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_1142 = const()[name = string("op_1142"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1158_pad_type_0 = const()[name = string("op_1158_pad_type_0"), val = string("valid")];
+            int32 var_1158_groups_0 = const()[name = string("op_1158_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1158_strides_0 = const()[name = string("op_1158_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1158_pad_0 = const()[name = string("op_1158_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1158_dilations_0 = const()[name = string("op_1158_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531256512))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533878016))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1143_cast_fp16 = transpose(perm = var_1142, x = attn_output_3_cast_fp16)[name = string("transpose_226")];
+            tensor<fp16, [1, 2560, 3]> var_1158_cast_fp16 = conv(dilations = var_1158_dilations_0, groups = var_1158_groups_0, pad = var_1158_pad_0, pad_type = var_1158_pad_type_0, strides = var_1158_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_1143_cast_fp16)[name = string("op_1158_cast_fp16")];
+            tensor<int32, [3]> var_1162 = const()[name = string("op_1162"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1168 = const()[name = string("op_1168"), val = int32(-1)];
+            fp16 const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_11_cast_fp16 = transpose(perm = var_1162, x = var_1158_cast_fp16)[name = string("transpose_225")];
+            tensor<fp16, [1, 3, 2560]> var_1170_cast_fp16 = mul(x = x_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("op_1170_cast_fp16")];
+            bool input_15_interleave_0 = const()[name = string("input_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_15_cast_fp16 = concat(axis = var_1168, interleave = input_15_interleave_0, values = (x_11_cast_fp16, var_1170_cast_fp16))[name = string("input_15_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1165_to_fp16 = const()[name = string("op_1165_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_1165_to_fp16, x = input_15_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_1175_split_sizes_0 = const()[name = string("op_1175_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1175_axis_0 = const()[name = string("op_1175_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1175_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1175_cast_fp16_1 = split(axis = var_1175_axis_0, split_sizes = var_1175_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_1175_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533880640)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_5_cast_fp16 = mul(x = var_1175_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_13_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_13_cast_fp16")];
+            int32 var_1184 = const()[name = string("op_1184"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1186_cast_fp16 = mul(x = x_13_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_1186_cast_fp16")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_17_cast_fp16 = concat(axis = var_1184, interleave = input_17_interleave_0, values = (x_13_cast_fp16, var_1186_cast_fp16))[name = string("input_17_cast_fp16")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1181_to_fp16 = const()[name = string("op_1181_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_1181_to_fp16, x = input_17_cast_fp16)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_1191_split_sizes_0 = const()[name = string("op_1191_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1191_axis_0 = const()[name = string("op_1191_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1191_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1191_cast_fp16_1 = split(axis = var_1191_axis_0, split_sizes = var_1191_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_1191_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533885824)))];
+            tensor<fp16, [1, 3, 2560]> h_3_cast_fp16 = mul(x = var_1191_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_1202 = const()[name = string("op_1202"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1203 = transpose(perm = var_1202, x = h_3_cast_fp16)[name = string("transpose_224")];
+            tensor<fp16, [1, 2560, 1, 3]> input_19 = expand_dims(axes = input_19_axes_0, x = var_1203)[name = string("input_19")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_19)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_19)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 3]> input_21 = mul(x = gate_3, y = up_1)[name = string("input_21")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_21)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_1243_axes_0 = const()[name = string("op_1243_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1243 = squeeze(axes = var_1243_axes_0, x = mlp_out_1)[name = string("op_1243")];
+            tensor<int32, [3]> var_1247 = const()[name = string("op_1247"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1253 = const()[name = string("op_1253"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_15 = transpose(perm = var_1247, x = var_1243)[name = string("transpose_223")];
+            tensor<fp16, [1, 3, 2560]> var_1255 = mul(x = x_15, y = const_9_promoted)[name = string("op_1255")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_23 = concat(axis = var_1253, interleave = input_23_interleave_0, values = (x_15, var_1255))[name = string("input_23")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1250_to_fp16 = const()[name = string("op_1250_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_1250_to_fp16, x = input_23)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_1260_split_sizes_0 = const()[name = string("op_1260_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1260_axis_0 = const()[name = string("op_1260_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1260_0, tensor<fp16, [1, 3, 2560]> var_1260_1 = split(axis = var_1260_axis_0, split_sizes = var_1260_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_1260")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_3 = mul(x = var_1260_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_5_cast_fp16 = add(x = x_13_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 3072])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 3, 3328])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_1288 = const()[name = string("op_1288"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_25_axes_0 = const()[name = string("input_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1289 = transpose(perm = var_1288, x = hidden_states_5_cast_fp16)[name = string("transpose_222")];
+            tensor<fp16, [1, 2560, 1, 3]> input_25 = expand_dims(axes = input_25_axes_0, x = var_1289)[name = string("input_25")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_25)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_1308 = const()[name = string("op_1308"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1309_cast_fp16 = transpose(perm = var_1308, x = per_layer_slice_1_cast_fp16)[name = string("transpose_221")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_1309_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_27_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_27_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533891008))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534218752))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_27_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_1325_axes_0 = const()[name = string("op_1325_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1325_cast_fp16 = squeeze(axes = var_1325_axes_0, x = gated_5_cast_fp16)[name = string("op_1325_cast_fp16")];
+            tensor<int32, [3]> var_1329 = const()[name = string("op_1329"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1335 = const()[name = string("op_1335"), val = int32(-1)];
+            fp16 const_10_promoted_to_fp16 = const()[name = string("const_10_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_17_cast_fp16 = transpose(perm = var_1329, x = var_1325_cast_fp16)[name = string("transpose_220")];
+            tensor<fp16, [1, 3, 2560]> var_1337_cast_fp16 = mul(x = x_17_cast_fp16, y = const_10_promoted_to_fp16)[name = string("op_1337_cast_fp16")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_29_cast_fp16 = concat(axis = var_1335, interleave = input_29_interleave_0, values = (x_17_cast_fp16, var_1337_cast_fp16))[name = string("input_29_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1332_to_fp16 = const()[name = string("op_1332_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_1332_to_fp16, x = input_29_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_1342_split_sizes_0 = const()[name = string("op_1342_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1342_axis_0 = const()[name = string("op_1342_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1342_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1342_cast_fp16_1 = split(axis = var_1342_axis_0, split_sizes = var_1342_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_1342_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534221376)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_9_cast_fp16 = mul(x = var_1342_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.7ep-1])];
+            tensor<fp16, [1, 3, 2560]> x_19_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_11_promoted_to_fp16)[name = string("x_19_cast_fp16")];
+            int32 var_1357 = const()[name = string("op_1357"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1359_cast_fp16 = mul(x = x_19_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1359_cast_fp16")];
+            bool input_31_interleave_0 = const()[name = string("input_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_31_cast_fp16 = concat(axis = var_1357, interleave = input_31_interleave_0, values = (x_19_cast_fp16, var_1359_cast_fp16))[name = string("input_31_cast_fp16")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1354_to_fp16 = const()[name = string("op_1354_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_1354_to_fp16, x = input_31_cast_fp16)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_1364_split_sizes_0 = const()[name = string("op_1364_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1364_axis_0 = const()[name = string("op_1364_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1364_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1364_cast_fp16_1 = split(axis = var_1364_axis_0, split_sizes = var_1364_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_1364_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534226560)))];
+            tensor<fp16, [1, 3, 2560]> h_7_cast_fp16 = mul(x = var_1364_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_1370 = const()[name = string("op_1370"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1373_axes_0 = const()[name = string("op_1373_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1371_cast_fp16 = transpose(perm = var_1370, x = h_7_cast_fp16)[name = string("transpose_219")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1373_cast_fp16 = expand_dims(axes = var_1373_axes_0, x = var_1371_cast_fp16)[name = string("op_1373_cast_fp16")];
+            string q_13_pad_type_0 = const()[name = string("q_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_13_strides_0 = const()[name = string("q_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_13_pad_0 = const()[name = string("q_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_13_dilations_0 = const()[name = string("q_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_13_groups_0 = const()[name = string("q_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_13 = conv(dilations = q_13_dilations_0, groups = q_13_groups_0, pad = q_13_pad_0, pad_type = q_13_pad_type_0, strides = q_13_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_1373_cast_fp16)[name = string("q_13")];
+            tensor<int32, [4]> var_1394 = const()[name = string("op_1394"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_1395 = reshape(shape = var_1394, x = q_13)[name = string("op_1395")];
+            tensor<int32, [4]> transpose_51_perm_0 = const()[name = string("transpose_51_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_1418 = const()[name = string("op_1418"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_51 = transpose(perm = transpose_51_perm_0, x = var_1395)[name = string("transpose_218")];
+            tensor<fp16, [3, 8, 256]> x_21 = reshape(shape = var_1418, x = transpose_51)[name = string("x_21")];
+            int32 var_1424 = const()[name = string("op_1424"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_1426 = mul(x = x_21, y = const_13_promoted)[name = string("op_1426")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_35 = concat(axis = var_1424, interleave = input_35_interleave_0, values = (x_21, var_1426))[name = string("input_35")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1421_to_fp16 = const()[name = string("op_1421_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1421_to_fp16, x = input_35)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1431_split_sizes_0 = const()[name = string("op_1431_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1431_axis_0 = const()[name = string("op_1431_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_1431_0, tensor<fp16, [3, 8, 256]> var_1431_1 = split(axis = var_1431_axis_0, split_sizes = var_1431_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1431")];
+            tensor<fp16, [3, 8, 256]> q_17 = mul(x = var_1431_0, y = layers_1_self_attn_q_norm_weight)[name = string("q_17")];
+            tensor<int32, [4]> var_1438 = const()[name = string("op_1438"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_1439 = reshape(shape = var_1438, x = q_17)[name = string("op_1439")];
+            tensor<int32, [4]> var_1444 = const()[name = string("op_1444"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_19 = transpose(perm = var_1444, x = var_1439)[name = string("transpose_217")];
+            tensor<fp16, [1, 8, 3, 256]> var_1446_cast_fp16 = mul(x = q_19, y = cos_s)[name = string("op_1446_cast_fp16")];
+            tensor<int32, [2]> var_1447_split_sizes_0 = const()[name = string("op_1447_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1447_axis_0 = const()[name = string("op_1447_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_1447_0, tensor<fp16, [1, 8, 3, 128]> var_1447_1 = split(axis = var_1447_axis_0, split_sizes = var_1447_split_sizes_0, x = q_19)[name = string("op_1447")];
+            fp16 const_14_promoted = const()[name = string("const_14_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_1449 = mul(x = var_1447_1, y = const_14_promoted)[name = string("op_1449")];
+            int32 var_1451 = const()[name = string("op_1451"), val = int32(-1)];
+            bool var_1452_interleave_0 = const()[name = string("op_1452_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_1452 = concat(axis = var_1451, interleave = var_1452_interleave_0, values = (var_1449, var_1447_0))[name = string("op_1452")];
+            tensor<fp16, [1, 8, 3, 256]> var_1453_cast_fp16 = mul(x = var_1452, y = sin_s)[name = string("op_1453_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_23_cast_fp16 = add(x = var_1446_cast_fp16, y = var_1453_cast_fp16)[name = string("q_23_cast_fp16")];
+            string k_7_pad_type_0 = const()[name = string("k_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_7_strides_0 = const()[name = string("k_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_7_pad_0 = const()[name = string("k_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_7_dilations_0 = const()[name = string("k_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_7_groups_0 = const()[name = string("k_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_7 = conv(dilations = k_7_dilations_0, groups = k_7_groups_0, pad = k_7_pad_0, pad_type = k_7_pad_type_0, strides = k_7_strides_0, weight = layers_1_self_attn_k_proj_weight_palettized, x = var_1373_cast_fp16)[name = string("k_7")];
+            tensor<int32, [4]> var_1471 = const()[name = string("op_1471"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_1472 = reshape(shape = var_1471, x = k_7)[name = string("op_1472")];
+            tensor<int32, [4]> transpose_52_perm_0 = const()[name = string("transpose_52_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_3_pad_type_0 = const()[name = string("v_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_3_strides_0 = const()[name = string("v_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_3_pad_0 = const()[name = string("v_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_3_dilations_0 = const()[name = string("v_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_3_groups_0 = const()[name = string("v_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_3 = conv(dilations = v_3_dilations_0, groups = v_3_groups_0, pad = v_3_pad_0, pad_type = v_3_pad_type_0, strides = v_3_strides_0, weight = layers_1_self_attn_v_proj_weight_palettized, x = var_1373_cast_fp16)[name = string("v_3")];
+            tensor<int32, [4]> var_1499 = const()[name = string("op_1499"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_1500 = reshape(shape = var_1499, x = v_3)[name = string("op_1500")];
+            tensor<int32, [4]> var_1505 = const()[name = string("op_1505"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1523 = const()[name = string("op_1523"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_52 = transpose(perm = transpose_52_perm_0, x = var_1472)[name = string("transpose_216")];
+            tensor<fp16, [3, 2, 256]> x_23 = reshape(shape = var_1523, x = transpose_52)[name = string("x_23")];
+            int32 var_1529 = const()[name = string("op_1529"), val = int32(-1)];
+            fp16 const_15_promoted = const()[name = string("const_15_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_1531 = mul(x = x_23, y = const_15_promoted)[name = string("op_1531")];
+            bool input_37_interleave_0 = const()[name = string("input_37_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_37 = concat(axis = var_1529, interleave = input_37_interleave_0, values = (x_23, var_1531))[name = string("input_37")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1526_to_fp16 = const()[name = string("op_1526_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1526_to_fp16, x = input_37)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1536_split_sizes_0 = const()[name = string("op_1536_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1536_axis_0 = const()[name = string("op_1536_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_1536_0, tensor<fp16, [3, 2, 256]> var_1536_1 = split(axis = var_1536_axis_0, split_sizes = var_1536_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1536")];
+            tensor<fp16, [3, 2, 256]> k_11 = mul(x = var_1536_0, y = layers_1_self_attn_k_norm_weight)[name = string("k_11")];
+            tensor<int32, [4]> var_1543 = const()[name = string("op_1543"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_1544 = reshape(shape = var_1543, x = k_11)[name = string("op_1544")];
+            tensor<int32, [4]> var_1549 = const()[name = string("op_1549"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_1551_promoted = const()[name = string("op_1551_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_1506 = transpose(perm = var_1505, x = var_1500)[name = string("transpose_215")];
+            tensor<fp16, [1, 2, 3, 256]> var_1552 = pow(x = var_1506, y = var_1551_promoted)[name = string("op_1552")];
+            tensor<int32, [1]> var_1557_axes_0 = const()[name = string("op_1557_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1557_keep_dims_0 = const()[name = string("op_1557_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_1557 = reduce_mean(axes = var_1557_axes_0, keep_dims = var_1557_keep_dims_0, x = var_1552)[name = string("op_1557")];
+            fp16 var_1559_to_fp16 = const()[name = string("op_1559_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_3_cast_fp16 = add(x = var_1557, y = var_1559_to_fp16)[name = string("mean_sq_3_cast_fp16")];
+            fp32 var_1561_epsilon_0 = const()[name = string("op_1561_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_1561_cast_fp16 = rsqrt(epsilon = var_1561_epsilon_0, x = mean_sq_3_cast_fp16)[name = string("op_1561_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_41_cast_fp16 = mul(x = var_1506, y = var_1561_cast_fp16)[name = string("input_41_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_21 = transpose(perm = var_1549, x = var_1544)[name = string("transpose_214")];
+            tensor<fp16, [1, 2, 3, 256]> var_1563_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_1563_cast_fp16")];
+            tensor<int32, [2]> var_1564_split_sizes_0 = const()[name = string("op_1564_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1564_axis_0 = const()[name = string("op_1564_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_1564_0, tensor<fp16, [1, 2, 3, 128]> var_1564_1 = split(axis = var_1564_axis_0, split_sizes = var_1564_split_sizes_0, x = q_21)[name = string("op_1564")];
+            fp16 const_16_promoted = const()[name = string("const_16_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_1566 = mul(x = var_1564_1, y = const_16_promoted)[name = string("op_1566")];
+            int32 var_1568 = const()[name = string("op_1568"), val = int32(-1)];
+            bool var_1569_interleave_0 = const()[name = string("op_1569_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_1569 = concat(axis = var_1568, interleave = var_1569_interleave_0, values = (var_1566, var_1564_0))[name = string("op_1569")];
+            tensor<fp16, [1, 2, 3, 256]> var_1570_cast_fp16 = mul(x = var_1569, y = sin_s)[name = string("op_1570_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_39_cast_fp16 = add(x = var_1563_cast_fp16, y = var_1570_cast_fp16)[name = string("input_39_cast_fp16")];
+            tensor<int32, [8]> k_padded_3_pad_0 = const()[name = string("k_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_3_mode_0 = const()[name = string("k_padded_3_mode_0"), val = string("constant")];
+            fp16 const_17_to_fp16 = const()[name = string("const_17_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_3_cast_fp16 = pad(constant_val = const_17_to_fp16, mode = k_padded_3_mode_0, pad = k_padded_3_pad_0, x = input_39_cast_fp16)[name = string("k_padded_3_cast_fp16")];
+            tensor<int32, [8]> v_padded_3_pad_0 = const()[name = string("v_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_3_mode_0 = const()[name = string("v_padded_3_mode_0"), val = string("constant")];
+            fp16 const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_3_cast_fp16 = pad(constant_val = const_18_to_fp16, mode = v_padded_3_mode_0, pad = v_padded_3_pad_0, x = input_41_cast_fp16)[name = string("v_padded_3_cast_fp16")];
+            tensor<int32, [4]> slot_k_3_begin_0 = const()[name = string("slot_k_3_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_3_end_0 = const()[name = string("slot_k_3_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_3_end_mask_0 = const()[name = string("slot_k_3_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_3_cast_fp16 = slice_by_index(begin = slot_k_3_begin_0, end = slot_k_3_end_0, end_mask = slot_k_3_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("slot_k_3_cast_fp16")];
+            tensor<int32, [4]> slot_v_3_begin_0 = const()[name = string("slot_v_3_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_3_end_0 = const()[name = string("slot_v_3_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_3_end_mask_0 = const()[name = string("slot_v_3_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_3_cast_fp16 = slice_by_index(begin = slot_v_3_begin_0, end = slot_v_3_end_0, end_mask = slot_v_3_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("slot_v_3_cast_fp16")];
+            tensor<int32, [4]> var_1609_begin_0 = const()[name = string("op_1609_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1609_end_0 = const()[name = string("op_1609_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1609_end_mask_0 = const()[name = string("op_1609_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1609_cast_fp16 = slice_by_index(begin = var_1609_begin_0, end = var_1609_end_0, end_mask = var_1609_end_mask_0, x = slot_k_3_cast_fp16)[name = string("op_1609_cast_fp16")];
+            int32 var_1616 = const()[name = string("op_1616"), val = int32(2)];
+            bool new_k_3_interleave_0 = const()[name = string("new_k_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_3_cast_fp16 = concat(axis = var_1616, interleave = new_k_3_interleave_0, values = (var_1609_cast_fp16, k_padded_3_cast_fp16))[name = string("new_k_3_cast_fp16")];
+            tensor<int32, [4]> var_1632_begin_0 = const()[name = string("op_1632_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_1632_end_0 = const()[name = string("op_1632_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1632_end_mask_0 = const()[name = string("op_1632_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_1632_cast_fp16 = slice_by_index(begin = var_1632_begin_0, end = var_1632_end_0, end_mask = var_1632_end_mask_0, x = slot_v_3_cast_fp16)[name = string("op_1632_cast_fp16")];
+            int32 var_1639 = const()[name = string("op_1639"), val = int32(2)];
+            bool new_v_3_interleave_0 = const()[name = string("new_v_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_3_cast_fp16 = concat(axis = var_1639, interleave = new_v_3_interleave_0, values = (var_1632_cast_fp16, v_padded_3_cast_fp16))[name = string("new_v_3_cast_fp16")];
+            tensor<int32, [4]> var_1650_begin_0 = const()[name = string("op_1650_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1650_end_0 = const()[name = string("op_1650_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1650_end_mask_0 = const()[name = string("op_1650_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_1650_cast_fp16 = slice_by_index(begin = var_1650_begin_0, end = var_1650_end_0, end_mask = var_1650_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1650_cast_fp16")];
+            int32 var_1652 = const()[name = string("op_1652"), val = int32(0)];
+            bool K_sliding_out_3_interleave_0 = const()[name = string("K_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_3_cast_fp16 = concat(axis = var_1652, interleave = K_sliding_out_3_interleave_0, values = (var_1052_cast_fp16, new_k_3_cast_fp16, var_1650_cast_fp16))[name = string("K_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1663_begin_0 = const()[name = string("op_1663_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_1663_end_0 = const()[name = string("op_1663_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_1663_end_mask_0 = const()[name = string("op_1663_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_1663_cast_fp16 = slice_by_index(begin = var_1663_begin_0, end = var_1663_end_0, end_mask = var_1663_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1663_cast_fp16")];
+            int32 var_1665 = const()[name = string("op_1665"), val = int32(0)];
+            bool V_sliding_out_3_interleave_0 = const()[name = string("V_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_3_cast_fp16 = concat(axis = var_1665, interleave = V_sliding_out_3_interleave_0, values = (var_1062_cast_fp16, new_v_3_cast_fp16, var_1663_cast_fp16))[name = string("V_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1671_begin_0 = const()[name = string("op_1671_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1671_end_0 = const()[name = string("op_1671_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1671_end_mask_0 = const()[name = string("op_1671_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1671_cast_fp16 = slice_by_index(begin = var_1671_begin_0, end = var_1671_end_0, end_mask = var_1671_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("op_1671_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_3_begin_0 = const()[name = string("K_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_3_end_0 = const()[name = string("K_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_3_end_mask_0 = const()[name = string("K_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_3_cast_fp16 = slice_by_index(begin = K_for_attn_3_begin_0, end = K_for_attn_3_end_0, end_mask = K_for_attn_3_end_mask_0, x = var_1671_cast_fp16)[name = string("K_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> var_1681_begin_0 = const()[name = string("op_1681_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1681_end_0 = const()[name = string("op_1681_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1681_end_mask_0 = const()[name = string("op_1681_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_1681_cast_fp16 = slice_by_index(begin = var_1681_begin_0, end = var_1681_end_0, end_mask = var_1681_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("op_1681_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_3_begin_0 = const()[name = string("V_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_3_end_0 = const()[name = string("V_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_3_end_mask_0 = const()[name = string("V_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_3_cast_fp16 = slice_by_index(begin = V_for_attn_3_begin_0, end = V_for_attn_3_end_0, end_mask = V_for_attn_3_end_mask_0, x = var_1681_cast_fp16)[name = string("V_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> transpose_4_perm_0 = const()[name = string("transpose_4_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_2_reps_0 = const()[name = string("tile_2_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_4_cast_fp16 = transpose(perm = transpose_4_perm_0, x = K_for_attn_3_cast_fp16)[name = string("transpose_213")];
+            tensor<fp16, [8, 1, 512, 256]> tile_2_cast_fp16 = tile(reps = tile_2_reps_0, x = transpose_4_cast_fp16)[name = string("tile_2_cast_fp16")];
+            tensor<int32, [5]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_4_cast_fp16 = reshape(shape = concat_4, x = tile_2_cast_fp16)[name = string("reshape_4_cast_fp16")];
+            tensor<int32, [5]> transpose_5_perm_0 = const()[name = string("transpose_5_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_5 = const()[name = string("concat_5"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_5_cast_fp16 = transpose(perm = transpose_5_perm_0, x = reshape_4_cast_fp16)[name = string("transpose_212")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_5_cast_fp16 = reshape(shape = concat_5, x = transpose_5_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [4]> transpose_53_perm_0 = const()[name = string("transpose_53_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_6_perm_0 = const()[name = string("transpose_6_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_3_reps_0 = const()[name = string("tile_3_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_6_cast_fp16 = transpose(perm = transpose_6_perm_0, x = V_for_attn_3_cast_fp16)[name = string("transpose_211")];
+            tensor<fp16, [8, 1, 512, 256]> tile_3_cast_fp16 = tile(reps = tile_3_reps_0, x = transpose_6_cast_fp16)[name = string("tile_3_cast_fp16")];
+            tensor<int32, [5]> concat_6 = const()[name = string("concat_6"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_6_cast_fp16 = reshape(shape = concat_6, x = tile_3_cast_fp16)[name = string("reshape_6_cast_fp16")];
+            tensor<int32, [5]> transpose_7_perm_0 = const()[name = string("transpose_7_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_7 = const()[name = string("concat_7"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_7_cast_fp16 = transpose(perm = transpose_7_perm_0, x = reshape_6_cast_fp16)[name = string("transpose_210")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_7_cast_fp16 = reshape(shape = concat_7, x = transpose_7_cast_fp16)[name = string("reshape_7_cast_fp16")];
+            tensor<int32, [4]> V_expanded_3_perm_0 = const()[name = string("V_expanded_3_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_53_cast_fp16 = transpose(perm = transpose_53_perm_0, x = reshape_5_cast_fp16)[name = string("transpose_209")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_23_cast_fp16, y = transpose_53_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_27_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_27_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_27_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 3, 512]> var_1716 = sub(x = x_27_cast_fp16, y = reduce_max_1)[name = string("op_1716")];
+            tensor<fp16, [1, 8, 3, 512]> var_1722 = exp(x = var_1716)[name = string("op_1722")];
+            tensor<int32, [1]> var_1732_axes_0 = const()[name = string("op_1732_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1732_keep_dims_0 = const()[name = string("op_1732_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1732 = reduce_sum(axes = var_1732_axes_0, keep_dims = var_1732_keep_dims_0, x = var_1722)[name = string("op_1732")];
+            tensor<fp16, [1, 8, 3, 512]> var_1738_cast_fp16 = real_div(x = var_1722, y = var_1732)[name = string("op_1738_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_3_cast_fp16 = transpose(perm = V_expanded_3_perm_0, x = reshape_7_cast_fp16)[name = string("transpose_208")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_1738_cast_fp16, y = V_expanded_3_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_1749 = const()[name = string("op_1749"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1756 = const()[name = string("op_1756"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1750_cast_fp16 = transpose(perm = var_1749, x = attn_output_7_cast_fp16)[name = string("transpose_207")];
+            tensor<fp16, [1, 3, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_1756, x = var_1750_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_1761 = const()[name = string("op_1761"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1777_pad_type_0 = const()[name = string("op_1777_pad_type_0"), val = string("valid")];
+            int32 var_1777_groups_0 = const()[name = string("op_1777_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1777_strides_0 = const()[name = string("op_1777_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1777_pad_0 = const()[name = string("op_1777_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1777_dilations_0 = const()[name = string("op_1777_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(534231744))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536853248))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1762_cast_fp16 = transpose(perm = var_1761, x = attn_output_9_cast_fp16)[name = string("transpose_206")];
+            tensor<fp16, [1, 2560, 3]> var_1777_cast_fp16 = conv(dilations = var_1777_dilations_0, groups = var_1777_groups_0, pad = var_1777_pad_0, pad_type = var_1777_pad_type_0, strides = var_1777_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_1762_cast_fp16)[name = string("op_1777_cast_fp16")];
+            tensor<int32, [3]> var_1781 = const()[name = string("op_1781"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1787 = const()[name = string("op_1787"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_31_cast_fp16 = transpose(perm = var_1781, x = var_1777_cast_fp16)[name = string("transpose_205")];
+            tensor<fp16, [1, 3, 2560]> var_1789_cast_fp16 = mul(x = x_31_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1789_cast_fp16")];
+            bool input_45_interleave_0 = const()[name = string("input_45_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_45_cast_fp16 = concat(axis = var_1787, interleave = input_45_interleave_0, values = (x_31_cast_fp16, var_1789_cast_fp16))[name = string("input_45_cast_fp16")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1784_to_fp16 = const()[name = string("op_1784_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1784_to_fp16, x = input_45_cast_fp16)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1794_split_sizes_0 = const()[name = string("op_1794_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1794_axis_0 = const()[name = string("op_1794_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1794_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1794_cast_fp16_1 = split(axis = var_1794_axis_0, split_sizes = var_1794_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1794_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536855872)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_11_cast_fp16 = mul(x = var_1794_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_33_cast_fp16 = add(x = x_19_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_33_cast_fp16")];
+            int32 var_1803 = const()[name = string("op_1803"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1805_cast_fp16 = mul(x = x_33_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1805_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_47_cast_fp16 = concat(axis = var_1803, interleave = input_47_interleave_0, values = (x_33_cast_fp16, var_1805_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1800_to_fp16 = const()[name = string("op_1800_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1800_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1810_split_sizes_0 = const()[name = string("op_1810_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1810_axis_0 = const()[name = string("op_1810_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1810_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1810_cast_fp16_1 = split(axis = var_1810_axis_0, split_sizes = var_1810_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1810_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536861056)))];
+            tensor<fp16, [1, 3, 2560]> h_9_cast_fp16 = mul(x = var_1810_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1821 = const()[name = string("op_1821"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_49_axes_0 = const()[name = string("input_49_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1822 = transpose(perm = var_1821, x = h_9_cast_fp16)[name = string("transpose_204")];
+            tensor<fp16, [1, 2560, 1, 3]> input_49 = expand_dims(axes = input_49_axes_0, x = var_1822)[name = string("input_49")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_49)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_49)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 3]> input_51 = mul(x = gate_7, y = up_3)[name = string("input_51")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_51)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1862_axes_0 = const()[name = string("op_1862_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1862 = squeeze(axes = var_1862_axes_0, x = mlp_out_3)[name = string("op_1862")];
+            tensor<int32, [3]> var_1866 = const()[name = string("op_1866"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1872 = const()[name = string("op_1872"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_35 = transpose(perm = var_1866, x = var_1862)[name = string("transpose_203")];
+            tensor<fp16, [1, 3, 2560]> var_1874 = mul(x = x_35, y = const_21_promoted)[name = string("op_1874")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_53 = concat(axis = var_1872, interleave = input_53_interleave_0, values = (x_35, var_1874))[name = string("input_53")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1869_to_fp16 = const()[name = string("op_1869_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1869_to_fp16, x = input_53)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1879_split_sizes_0 = const()[name = string("op_1879_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1879_axis_0 = const()[name = string("op_1879_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1879_0, tensor<fp16, [1, 3, 2560]> var_1879_1 = split(axis = var_1879_axis_0, split_sizes = var_1879_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1879")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_13 = mul(x = var_1879_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_15_cast_fp16 = add(x = x_33_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 3328])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 3, 3584])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1907 = const()[name = string("op_1907"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_55_axes_0 = const()[name = string("input_55_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1908 = transpose(perm = var_1907, x = hidden_states_15_cast_fp16)[name = string("transpose_202")];
+            tensor<fp16, [1, 2560, 1, 3]> input_55 = expand_dims(axes = input_55_axes_0, x = var_1908)[name = string("input_55")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_55)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1927 = const()[name = string("op_1927"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1928_cast_fp16 = transpose(perm = var_1927, x = per_layer_slice_3_cast_fp16)[name = string("transpose_201")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1928_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_57_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_57_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(536866240))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537193984))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_57_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1944_axes_0 = const()[name = string("op_1944_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1944_cast_fp16 = squeeze(axes = var_1944_axes_0, x = gated_11_cast_fp16)[name = string("op_1944_cast_fp16")];
+            tensor<int32, [3]> var_1948 = const()[name = string("op_1948"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1954 = const()[name = string("op_1954"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_37_cast_fp16 = transpose(perm = var_1948, x = var_1944_cast_fp16)[name = string("transpose_200")];
+            tensor<fp16, [1, 3, 2560]> var_1956_cast_fp16 = mul(x = x_37_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1956_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_59_cast_fp16 = concat(axis = var_1954, interleave = input_59_interleave_0, values = (x_37_cast_fp16, var_1956_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1951_to_fp16 = const()[name = string("op_1951_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1951_to_fp16, x = input_59_cast_fp16)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1961_split_sizes_0 = const()[name = string("op_1961_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1961_axis_0 = const()[name = string("op_1961_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1961_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1961_cast_fp16_1 = split(axis = var_1961_axis_0, split_sizes = var_1961_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1961_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537196608)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1961_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 3, 2560]> x_39_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_39_cast_fp16")];
+            int32 var_1976 = const()[name = string("op_1976"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1978_cast_fp16 = mul(x = x_39_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1978_cast_fp16")];
+            bool input_61_interleave_0 = const()[name = string("input_61_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_61_cast_fp16 = concat(axis = var_1976, interleave = input_61_interleave_0, values = (x_39_cast_fp16, var_1978_cast_fp16))[name = string("input_61_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1973_to_fp16 = const()[name = string("op_1973_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1973_to_fp16, x = input_61_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1983_split_sizes_0 = const()[name = string("op_1983_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1983_axis_0 = const()[name = string("op_1983_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1983_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1983_cast_fp16_1 = split(axis = var_1983_axis_0, split_sizes = var_1983_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1983_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537201792)))];
+            tensor<fp16, [1, 3, 2560]> h_13_cast_fp16 = mul(x = var_1983_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1989 = const()[name = string("op_1989"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1992_axes_0 = const()[name = string("op_1992_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1990_cast_fp16 = transpose(perm = var_1989, x = h_13_cast_fp16)[name = string("transpose_199")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1992_cast_fp16 = expand_dims(axes = var_1992_axes_0, x = var_1990_cast_fp16)[name = string("op_1992_cast_fp16")];
+            string q_25_pad_type_0 = const()[name = string("q_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_25_strides_0 = const()[name = string("q_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_25_pad_0 = const()[name = string("q_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_25_dilations_0 = const()[name = string("q_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_25_groups_0 = const()[name = string("q_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_25 = conv(dilations = q_25_dilations_0, groups = q_25_groups_0, pad = q_25_pad_0, pad_type = q_25_pad_type_0, strides = q_25_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1992_cast_fp16)[name = string("q_25")];
+            tensor<int32, [4]> var_2013 = const()[name = string("op_2013"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2014 = reshape(shape = var_2013, x = q_25)[name = string("op_2014")];
+            tensor<int32, [4]> transpose_54_perm_0 = const()[name = string("transpose_54_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2037 = const()[name = string("op_2037"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_54 = transpose(perm = transpose_54_perm_0, x = var_2014)[name = string("transpose_198")];
+            tensor<fp16, [3, 8, 256]> x_41 = reshape(shape = var_2037, x = transpose_54)[name = string("x_41")];
+            int32 var_2043 = const()[name = string("op_2043"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2045 = mul(x = x_41, y = const_25_promoted)[name = string("op_2045")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_65 = concat(axis = var_2043, interleave = input_65_interleave_0, values = (x_41, var_2045))[name = string("input_65")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2040_to_fp16 = const()[name = string("op_2040_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_2040_to_fp16, x = input_65)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_2050_split_sizes_0 = const()[name = string("op_2050_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2050_axis_0 = const()[name = string("op_2050_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2050_0, tensor<fp16, [3, 8, 256]> var_2050_1 = split(axis = var_2050_axis_0, split_sizes = var_2050_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_2050")];
+            tensor<fp16, [3, 8, 256]> q_29 = mul(x = var_2050_0, y = layers_2_self_attn_q_norm_weight)[name = string("q_29")];
+            tensor<int32, [4]> var_2057 = const()[name = string("op_2057"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2058 = reshape(shape = var_2057, x = q_29)[name = string("op_2058")];
+            tensor<int32, [4]> var_2063 = const()[name = string("op_2063"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_31 = transpose(perm = var_2063, x = var_2058)[name = string("transpose_197")];
+            tensor<fp16, [1, 8, 3, 256]> var_2065_cast_fp16 = mul(x = q_31, y = cos_s)[name = string("op_2065_cast_fp16")];
+            tensor<int32, [2]> var_2066_split_sizes_0 = const()[name = string("op_2066_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2066_axis_0 = const()[name = string("op_2066_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2066_0, tensor<fp16, [1, 8, 3, 128]> var_2066_1 = split(axis = var_2066_axis_0, split_sizes = var_2066_split_sizes_0, x = q_31)[name = string("op_2066")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2068 = mul(x = var_2066_1, y = const_26_promoted)[name = string("op_2068")];
+            int32 var_2070 = const()[name = string("op_2070"), val = int32(-1)];
+            bool var_2071_interleave_0 = const()[name = string("op_2071_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2071 = concat(axis = var_2070, interleave = var_2071_interleave_0, values = (var_2068, var_2066_0))[name = string("op_2071")];
+            tensor<fp16, [1, 8, 3, 256]> var_2072_cast_fp16 = mul(x = var_2071, y = sin_s)[name = string("op_2072_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_35_cast_fp16 = add(x = var_2065_cast_fp16, y = var_2072_cast_fp16)[name = string("q_35_cast_fp16")];
+            string k_13_pad_type_0 = const()[name = string("k_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_13_strides_0 = const()[name = string("k_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_13_pad_0 = const()[name = string("k_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_13_dilations_0 = const()[name = string("k_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_13_groups_0 = const()[name = string("k_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_13 = conv(dilations = k_13_dilations_0, groups = k_13_groups_0, pad = k_13_pad_0, pad_type = k_13_pad_type_0, strides = k_13_strides_0, weight = layers_2_self_attn_k_proj_weight_palettized, x = var_1992_cast_fp16)[name = string("k_13")];
+            tensor<int32, [4]> var_2090 = const()[name = string("op_2090"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2091 = reshape(shape = var_2090, x = k_13)[name = string("op_2091")];
+            tensor<int32, [4]> transpose_55_perm_0 = const()[name = string("transpose_55_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_5_pad_type_0 = const()[name = string("v_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_5_strides_0 = const()[name = string("v_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_5_pad_0 = const()[name = string("v_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_5_dilations_0 = const()[name = string("v_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_5_groups_0 = const()[name = string("v_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_5 = conv(dilations = v_5_dilations_0, groups = v_5_groups_0, pad = v_5_pad_0, pad_type = v_5_pad_type_0, strides = v_5_strides_0, weight = layers_2_self_attn_v_proj_weight_palettized, x = var_1992_cast_fp16)[name = string("v_5")];
+            tensor<int32, [4]> var_2118 = const()[name = string("op_2118"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2119 = reshape(shape = var_2118, x = v_5)[name = string("op_2119")];
+            tensor<int32, [4]> var_2124 = const()[name = string("op_2124"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2142 = const()[name = string("op_2142"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_55 = transpose(perm = transpose_55_perm_0, x = var_2091)[name = string("transpose_196")];
+            tensor<fp16, [3, 2, 256]> x_43 = reshape(shape = var_2142, x = transpose_55)[name = string("x_43")];
+            int32 var_2148 = const()[name = string("op_2148"), val = int32(-1)];
+            fp16 const_27_promoted = const()[name = string("const_27_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_2150 = mul(x = x_43, y = const_27_promoted)[name = string("op_2150")];
+            bool input_67_interleave_0 = const()[name = string("input_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_67 = concat(axis = var_2148, interleave = input_67_interleave_0, values = (x_43, var_2150))[name = string("input_67")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2145_to_fp16 = const()[name = string("op_2145_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_2145_to_fp16, x = input_67)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_2155_split_sizes_0 = const()[name = string("op_2155_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2155_axis_0 = const()[name = string("op_2155_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_2155_0, tensor<fp16, [3, 2, 256]> var_2155_1 = split(axis = var_2155_axis_0, split_sizes = var_2155_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_2155")];
+            tensor<fp16, [3, 2, 256]> k_17 = mul(x = var_2155_0, y = layers_2_self_attn_k_norm_weight)[name = string("k_17")];
+            tensor<int32, [4]> var_2162 = const()[name = string("op_2162"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_2163 = reshape(shape = var_2162, x = k_17)[name = string("op_2163")];
+            tensor<int32, [4]> var_2168 = const()[name = string("op_2168"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_2170_promoted = const()[name = string("op_2170_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_2125 = transpose(perm = var_2124, x = var_2119)[name = string("transpose_195")];
+            tensor<fp16, [1, 2, 3, 256]> var_2171 = pow(x = var_2125, y = var_2170_promoted)[name = string("op_2171")];
+            tensor<int32, [1]> var_2176_axes_0 = const()[name = string("op_2176_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2176_keep_dims_0 = const()[name = string("op_2176_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_2176 = reduce_mean(axes = var_2176_axes_0, keep_dims = var_2176_keep_dims_0, x = var_2171)[name = string("op_2176")];
+            fp16 var_2178_to_fp16 = const()[name = string("op_2178_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_5_cast_fp16 = add(x = var_2176, y = var_2178_to_fp16)[name = string("mean_sq_5_cast_fp16")];
+            fp32 var_2180_epsilon_0 = const()[name = string("op_2180_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_2180_cast_fp16 = rsqrt(epsilon = var_2180_epsilon_0, x = mean_sq_5_cast_fp16)[name = string("op_2180_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_71_cast_fp16 = mul(x = var_2125, y = var_2180_cast_fp16)[name = string("input_71_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_33 = transpose(perm = var_2168, x = var_2163)[name = string("transpose_194")];
+            tensor<fp16, [1, 2, 3, 256]> var_2182_cast_fp16 = mul(x = q_33, y = cos_s)[name = string("op_2182_cast_fp16")];
+            tensor<int32, [2]> var_2183_split_sizes_0 = const()[name = string("op_2183_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2183_axis_0 = const()[name = string("op_2183_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_2183_0, tensor<fp16, [1, 2, 3, 128]> var_2183_1 = split(axis = var_2183_axis_0, split_sizes = var_2183_split_sizes_0, x = q_33)[name = string("op_2183")];
+            fp16 const_28_promoted = const()[name = string("const_28_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_2185 = mul(x = var_2183_1, y = const_28_promoted)[name = string("op_2185")];
+            int32 var_2187 = const()[name = string("op_2187"), val = int32(-1)];
+            bool var_2188_interleave_0 = const()[name = string("op_2188_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_2188 = concat(axis = var_2187, interleave = var_2188_interleave_0, values = (var_2185, var_2183_0))[name = string("op_2188")];
+            tensor<fp16, [1, 2, 3, 256]> var_2189_cast_fp16 = mul(x = var_2188, y = sin_s)[name = string("op_2189_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_69_cast_fp16 = add(x = var_2182_cast_fp16, y = var_2189_cast_fp16)[name = string("input_69_cast_fp16")];
+            tensor<int32, [8]> k_padded_5_pad_0 = const()[name = string("k_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_5_mode_0 = const()[name = string("k_padded_5_mode_0"), val = string("constant")];
+            fp16 const_29_to_fp16 = const()[name = string("const_29_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_5_cast_fp16 = pad(constant_val = const_29_to_fp16, mode = k_padded_5_mode_0, pad = k_padded_5_pad_0, x = input_69_cast_fp16)[name = string("k_padded_5_cast_fp16")];
+            tensor<int32, [8]> v_padded_5_pad_0 = const()[name = string("v_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_5_mode_0 = const()[name = string("v_padded_5_mode_0"), val = string("constant")];
+            fp16 const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_5_cast_fp16 = pad(constant_val = const_30_to_fp16, mode = v_padded_5_mode_0, pad = v_padded_5_pad_0, x = input_71_cast_fp16)[name = string("v_padded_5_cast_fp16")];
+            tensor<int32, [4]> slot_k_5_begin_0 = const()[name = string("slot_k_5_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_5_end_0 = const()[name = string("slot_k_5_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_5_end_mask_0 = const()[name = string("slot_k_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_5_cast_fp16 = slice_by_index(begin = slot_k_5_begin_0, end = slot_k_5_end_0, end_mask = slot_k_5_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("slot_k_5_cast_fp16")];
+            tensor<int32, [4]> slot_v_5_begin_0 = const()[name = string("slot_v_5_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_5_end_0 = const()[name = string("slot_v_5_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_5_end_mask_0 = const()[name = string("slot_v_5_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_5_cast_fp16 = slice_by_index(begin = slot_v_5_begin_0, end = slot_v_5_end_0, end_mask = slot_v_5_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("slot_v_5_cast_fp16")];
+            tensor<int32, [4]> var_2228_begin_0 = const()[name = string("op_2228_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2228_end_0 = const()[name = string("op_2228_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2228_end_mask_0 = const()[name = string("op_2228_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2228_cast_fp16 = slice_by_index(begin = var_2228_begin_0, end = var_2228_end_0, end_mask = var_2228_end_mask_0, x = slot_k_5_cast_fp16)[name = string("op_2228_cast_fp16")];
+            int32 var_2235 = const()[name = string("op_2235"), val = int32(2)];
+            bool new_k_5_interleave_0 = const()[name = string("new_k_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_5_cast_fp16 = concat(axis = var_2235, interleave = new_k_5_interleave_0, values = (var_2228_cast_fp16, k_padded_5_cast_fp16))[name = string("new_k_5_cast_fp16")];
+            tensor<int32, [4]> var_2251_begin_0 = const()[name = string("op_2251_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2251_end_0 = const()[name = string("op_2251_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2251_end_mask_0 = const()[name = string("op_2251_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2251_cast_fp16 = slice_by_index(begin = var_2251_begin_0, end = var_2251_end_0, end_mask = var_2251_end_mask_0, x = slot_v_5_cast_fp16)[name = string("op_2251_cast_fp16")];
+            int32 var_2258 = const()[name = string("op_2258"), val = int32(2)];
+            bool new_v_5_interleave_0 = const()[name = string("new_v_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_5_cast_fp16 = concat(axis = var_2258, interleave = new_v_5_interleave_0, values = (var_2251_cast_fp16, v_padded_5_cast_fp16))[name = string("new_v_5_cast_fp16")];
+            tensor<int32, [4]> var_2264_begin_0 = const()[name = string("op_2264_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2264_end_0 = const()[name = string("op_2264_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_2264_end_mask_0 = const()[name = string("op_2264_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_2264_cast_fp16 = slice_by_index(begin = var_2264_begin_0, end = var_2264_end_0, end_mask = var_2264_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("op_2264_cast_fp16")];
+            tensor<int32, [4]> var_2269_begin_0 = const()[name = string("op_2269_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2269_end_0 = const()[name = string("op_2269_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2269_end_mask_0 = const()[name = string("op_2269_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_2269_cast_fp16 = slice_by_index(begin = var_2269_begin_0, end = var_2269_end_0, end_mask = var_2269_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("op_2269_cast_fp16")];
+            int32 var_2271 = const()[name = string("op_2271"), val = int32(0)];
+            bool K_sliding_out_5_interleave_0 = const()[name = string("K_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_5_cast_fp16 = concat(axis = var_2271, interleave = K_sliding_out_5_interleave_0, values = (var_2264_cast_fp16, new_k_5_cast_fp16, var_2269_cast_fp16))[name = string("K_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2277_begin_0 = const()[name = string("op_2277_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2277_end_0 = const()[name = string("op_2277_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_2277_end_mask_0 = const()[name = string("op_2277_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_2277_cast_fp16 = slice_by_index(begin = var_2277_begin_0, end = var_2277_end_0, end_mask = var_2277_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("op_2277_cast_fp16")];
+            tensor<int32, [4]> var_2282_begin_0 = const()[name = string("op_2282_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2282_end_0 = const()[name = string("op_2282_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2282_end_mask_0 = const()[name = string("op_2282_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_2282_cast_fp16 = slice_by_index(begin = var_2282_begin_0, end = var_2282_end_0, end_mask = var_2282_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("op_2282_cast_fp16")];
+            int32 var_2284 = const()[name = string("op_2284"), val = int32(0)];
+            bool V_sliding_out_5_interleave_0 = const()[name = string("V_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_5_cast_fp16 = concat(axis = var_2284, interleave = V_sliding_out_5_interleave_0, values = (var_2277_cast_fp16, new_v_5_cast_fp16, var_2282_cast_fp16))[name = string("V_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2290_begin_0 = const()[name = string("op_2290_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_2290_end_0 = const()[name = string("op_2290_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2290_end_mask_0 = const()[name = string("op_2290_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2290_cast_fp16 = slice_by_index(begin = var_2290_begin_0, end = var_2290_end_0, end_mask = var_2290_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2290_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_5_begin_0 = const()[name = string("K_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_5_end_0 = const()[name = string("K_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_5_end_mask_0 = const()[name = string("K_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_5_cast_fp16 = slice_by_index(begin = K_for_attn_5_begin_0, end = K_for_attn_5_end_0, end_mask = K_for_attn_5_end_mask_0, x = var_2290_cast_fp16)[name = string("K_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> var_2300_begin_0 = const()[name = string("op_2300_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_2300_end_0 = const()[name = string("op_2300_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2300_end_mask_0 = const()[name = string("op_2300_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2300_cast_fp16 = slice_by_index(begin = var_2300_begin_0, end = var_2300_end_0, end_mask = var_2300_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2300_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_5_begin_0 = const()[name = string("V_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_5_end_0 = const()[name = string("V_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_5_end_mask_0 = const()[name = string("V_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_5_cast_fp16 = slice_by_index(begin = V_for_attn_5_begin_0, end = V_for_attn_5_end_0, end_mask = V_for_attn_5_end_mask_0, x = var_2300_cast_fp16)[name = string("V_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = K_for_attn_5_cast_fp16)[name = string("transpose_193")];
+            tensor<fp16, [8, 1, 512, 256]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_192")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_56_perm_0 = const()[name = string("transpose_56_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = V_for_attn_5_cast_fp16)[name = string("transpose_191")];
+            tensor<fp16, [8, 1, 512, 256]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_190")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_56_cast_fp16 = transpose(perm = transpose_56_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_189")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_35_cast_fp16, y = transpose_56_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_47_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_47_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_47_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 3, 512]> var_2335 = sub(x = x_47_cast_fp16, y = reduce_max_2)[name = string("op_2335")];
+            tensor<fp16, [1, 8, 3, 512]> var_2341 = exp(x = var_2335)[name = string("op_2341")];
+            tensor<int32, [1]> var_2351_axes_0 = const()[name = string("op_2351_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2351_keep_dims_0 = const()[name = string("op_2351_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2351 = reduce_sum(axes = var_2351_axes_0, keep_dims = var_2351_keep_dims_0, x = var_2341)[name = string("op_2351")];
+            tensor<fp16, [1, 8, 3, 512]> var_2357_cast_fp16 = real_div(x = var_2341, y = var_2351)[name = string("op_2357_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_188")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_2357_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_2368 = const()[name = string("op_2368"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2375 = const()[name = string("op_2375"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2369_cast_fp16 = transpose(perm = var_2368, x = attn_output_13_cast_fp16)[name = string("transpose_187")];
+            tensor<fp16, [1, 3, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_2375, x = var_2369_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_2380 = const()[name = string("op_2380"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2396_pad_type_0 = const()[name = string("op_2396_pad_type_0"), val = string("valid")];
+            int32 var_2396_groups_0 = const()[name = string("op_2396_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2396_strides_0 = const()[name = string("op_2396_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2396_pad_0 = const()[name = string("op_2396_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2396_dilations_0 = const()[name = string("op_2396_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(537206976))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539828480))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2381_cast_fp16 = transpose(perm = var_2380, x = attn_output_15_cast_fp16)[name = string("transpose_186")];
+            tensor<fp16, [1, 2560, 3]> var_2396_cast_fp16 = conv(dilations = var_2396_dilations_0, groups = var_2396_groups_0, pad = var_2396_pad_0, pad_type = var_2396_pad_type_0, strides = var_2396_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_2381_cast_fp16)[name = string("op_2396_cast_fp16")];
+            tensor<int32, [3]> var_2400 = const()[name = string("op_2400"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2406 = const()[name = string("op_2406"), val = int32(-1)];
+            fp16 const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_51_cast_fp16 = transpose(perm = var_2400, x = var_2396_cast_fp16)[name = string("transpose_185")];
+            tensor<fp16, [1, 3, 2560]> var_2408_cast_fp16 = mul(x = x_51_cast_fp16, y = const_31_promoted_to_fp16)[name = string("op_2408_cast_fp16")];
+            bool input_75_interleave_0 = const()[name = string("input_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_75_cast_fp16 = concat(axis = var_2406, interleave = input_75_interleave_0, values = (x_51_cast_fp16, var_2408_cast_fp16))[name = string("input_75_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2403_to_fp16 = const()[name = string("op_2403_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_2403_to_fp16, x = input_75_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_2413_split_sizes_0 = const()[name = string("op_2413_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2413_axis_0 = const()[name = string("op_2413_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2413_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2413_cast_fp16_1 = split(axis = var_2413_axis_0, split_sizes = var_2413_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_2413_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539831104)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_17_cast_fp16 = mul(x = var_2413_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_53_cast_fp16 = add(x = x_39_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_53_cast_fp16")];
+            int32 var_2422 = const()[name = string("op_2422"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2424_cast_fp16 = mul(x = x_53_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2424_cast_fp16")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_77_cast_fp16 = concat(axis = var_2422, interleave = input_77_interleave_0, values = (x_53_cast_fp16, var_2424_cast_fp16))[name = string("input_77_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2419_to_fp16 = const()[name = string("op_2419_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_2419_to_fp16, x = input_77_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_2429_split_sizes_0 = const()[name = string("op_2429_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2429_axis_0 = const()[name = string("op_2429_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2429_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2429_cast_fp16_1 = split(axis = var_2429_axis_0, split_sizes = var_2429_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_2429_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539836288)))];
+            tensor<fp16, [1, 3, 2560]> h_15_cast_fp16 = mul(x = var_2429_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_2440 = const()[name = string("op_2440"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_79_axes_0 = const()[name = string("input_79_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2441 = transpose(perm = var_2440, x = h_15_cast_fp16)[name = string("transpose_184")];
+            tensor<fp16, [1, 2560, 1, 3]> input_79 = expand_dims(axes = input_79_axes_0, x = var_2441)[name = string("input_79")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_79)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_79)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 3]> input_81 = mul(x = gate_11, y = up_5)[name = string("input_81")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_81)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_2481_axes_0 = const()[name = string("op_2481_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2481 = squeeze(axes = var_2481_axes_0, x = mlp_out_5)[name = string("op_2481")];
+            tensor<int32, [3]> var_2485 = const()[name = string("op_2485"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2491 = const()[name = string("op_2491"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_55 = transpose(perm = var_2485, x = var_2481)[name = string("transpose_183")];
+            tensor<fp16, [1, 3, 2560]> var_2493 = mul(x = x_55, y = const_33_promoted)[name = string("op_2493")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_83 = concat(axis = var_2491, interleave = input_83_interleave_0, values = (x_55, var_2493))[name = string("input_83")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2488_to_fp16 = const()[name = string("op_2488_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_2488_to_fp16, x = input_83)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_2498_split_sizes_0 = const()[name = string("op_2498_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2498_axis_0 = const()[name = string("op_2498_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2498_0, tensor<fp16, [1, 3, 2560]> var_2498_1 = split(axis = var_2498_axis_0, split_sizes = var_2498_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_2498")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_23 = mul(x = var_2498_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_25_cast_fp16 = add(x = x_53_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 3584])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 3, 3840])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_2526 = const()[name = string("op_2526"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2527 = transpose(perm = var_2526, x = hidden_states_25_cast_fp16)[name = string("transpose_182")];
+            tensor<fp16, [1, 2560, 1, 3]> input_85 = expand_dims(axes = input_85_axes_0, x = var_2527)[name = string("input_85")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_85)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_2546 = const()[name = string("op_2546"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_2547_cast_fp16 = transpose(perm = var_2546, x = per_layer_slice_5_cast_fp16)[name = string("transpose_181")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_2547_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_87_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_87_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(539841472))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540169216))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_87_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_2563_axes_0 = const()[name = string("op_2563_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2563_cast_fp16 = squeeze(axes = var_2563_axes_0, x = gated_17_cast_fp16)[name = string("op_2563_cast_fp16")];
+            tensor<int32, [3]> var_2567 = const()[name = string("op_2567"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2573 = const()[name = string("op_2573"), val = int32(-1)];
+            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_57_cast_fp16 = transpose(perm = var_2567, x = var_2563_cast_fp16)[name = string("transpose_180")];
+            tensor<fp16, [1, 3, 2560]> var_2575_cast_fp16 = mul(x = x_57_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_2575_cast_fp16")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_89_cast_fp16 = concat(axis = var_2573, interleave = input_89_interleave_0, values = (x_57_cast_fp16, var_2575_cast_fp16))[name = string("input_89_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2570_to_fp16 = const()[name = string("op_2570_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_2570_to_fp16, x = input_89_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_2580_split_sizes_0 = const()[name = string("op_2580_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2580_axis_0 = const()[name = string("op_2580_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2580_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2580_cast_fp16_1 = split(axis = var_2580_axis_0, split_sizes = var_2580_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_2580_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540171840)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_29_cast_fp16 = mul(x = var_2580_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.58p-1])];
+            tensor<fp16, [1, 3, 2560]> x_59_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_35_promoted_to_fp16)[name = string("x_59_cast_fp16")];
+            int32 var_2595 = const()[name = string("op_2595"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2597_cast_fp16 = mul(x = x_59_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2597_cast_fp16")];
+            bool input_91_interleave_0 = const()[name = string("input_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_91_cast_fp16 = concat(axis = var_2595, interleave = input_91_interleave_0, values = (x_59_cast_fp16, var_2597_cast_fp16))[name = string("input_91_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2592_to_fp16 = const()[name = string("op_2592_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_2592_to_fp16, x = input_91_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_2602_split_sizes_0 = const()[name = string("op_2602_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2602_axis_0 = const()[name = string("op_2602_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2602_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2602_cast_fp16_1 = split(axis = var_2602_axis_0, split_sizes = var_2602_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_2602_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540177024)))];
+            tensor<fp16, [1, 3, 2560]> h_19_cast_fp16 = mul(x = var_2602_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_2608 = const()[name = string("op_2608"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2611_axes_0 = const()[name = string("op_2611_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2609_cast_fp16 = transpose(perm = var_2608, x = h_19_cast_fp16)[name = string("transpose_179")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2611_cast_fp16 = expand_dims(axes = var_2611_axes_0, x = var_2609_cast_fp16)[name = string("op_2611_cast_fp16")];
+            string q_37_pad_type_0 = const()[name = string("q_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_37_strides_0 = const()[name = string("q_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_37_pad_0 = const()[name = string("q_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_37_dilations_0 = const()[name = string("q_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_37_groups_0 = const()[name = string("q_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_37 = conv(dilations = q_37_dilations_0, groups = q_37_groups_0, pad = q_37_pad_0, pad_type = q_37_pad_type_0, strides = q_37_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_2611_cast_fp16)[name = string("q_37")];
+            tensor<int32, [4]> var_2632 = const()[name = string("op_2632"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2633 = reshape(shape = var_2632, x = q_37)[name = string("op_2633")];
+            tensor<int32, [4]> transpose_57_perm_0 = const()[name = string("transpose_57_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2656 = const()[name = string("op_2656"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_57 = transpose(perm = transpose_57_perm_0, x = var_2633)[name = string("transpose_178")];
+            tensor<fp16, [3, 8, 256]> x_61 = reshape(shape = var_2656, x = transpose_57)[name = string("x_61")];
+            int32 var_2662 = const()[name = string("op_2662"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2664 = mul(x = x_61, y = const_37_promoted)[name = string("op_2664")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_95 = concat(axis = var_2662, interleave = input_95_interleave_0, values = (x_61, var_2664))[name = string("input_95")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2659_to_fp16 = const()[name = string("op_2659_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_2659_to_fp16, x = input_95)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_2669_split_sizes_0 = const()[name = string("op_2669_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2669_axis_0 = const()[name = string("op_2669_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2669_0, tensor<fp16, [3, 8, 256]> var_2669_1 = split(axis = var_2669_axis_0, split_sizes = var_2669_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_2669")];
+            tensor<fp16, [3, 8, 256]> q_41 = mul(x = var_2669_0, y = layers_3_self_attn_q_norm_weight)[name = string("q_41")];
+            tensor<int32, [4]> var_2676 = const()[name = string("op_2676"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2677 = reshape(shape = var_2676, x = q_41)[name = string("op_2677")];
+            tensor<int32, [4]> var_2682 = const()[name = string("op_2682"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_43 = transpose(perm = var_2682, x = var_2677)[name = string("transpose_177")];
+            tensor<fp16, [1, 8, 3, 256]> var_2684_cast_fp16 = mul(x = q_43, y = cos_s)[name = string("op_2684_cast_fp16")];
+            tensor<int32, [2]> var_2685_split_sizes_0 = const()[name = string("op_2685_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2685_axis_0 = const()[name = string("op_2685_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2685_0, tensor<fp16, [1, 8, 3, 128]> var_2685_1 = split(axis = var_2685_axis_0, split_sizes = var_2685_split_sizes_0, x = q_43)[name = string("op_2685")];
+            fp16 const_38_promoted = const()[name = string("const_38_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2687 = mul(x = var_2685_1, y = const_38_promoted)[name = string("op_2687")];
+            int32 var_2689 = const()[name = string("op_2689"), val = int32(-1)];
+            bool var_2690_interleave_0 = const()[name = string("op_2690_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2690 = concat(axis = var_2689, interleave = var_2690_interleave_0, values = (var_2687, var_2685_0))[name = string("op_2690")];
+            tensor<fp16, [1, 8, 3, 256]> var_2691_cast_fp16 = mul(x = var_2690, y = sin_s)[name = string("op_2691_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_47_cast_fp16 = add(x = var_2684_cast_fp16, y = var_2691_cast_fp16)[name = string("q_47_cast_fp16")];
+            string k_19_pad_type_0 = const()[name = string("k_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_19_strides_0 = const()[name = string("k_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_19_pad_0 = const()[name = string("k_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_19_dilations_0 = const()[name = string("k_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_19_groups_0 = const()[name = string("k_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_19 = conv(dilations = k_19_dilations_0, groups = k_19_groups_0, pad = k_19_pad_0, pad_type = k_19_pad_type_0, strides = k_19_strides_0, weight = layers_3_self_attn_k_proj_weight_palettized, x = var_2611_cast_fp16)[name = string("k_19")];
+            tensor<int32, [4]> var_2709 = const()[name = string("op_2709"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2710 = reshape(shape = var_2709, x = k_19)[name = string("op_2710")];
+            tensor<int32, [4]> transpose_58_perm_0 = const()[name = string("transpose_58_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_7_pad_type_0 = const()[name = string("v_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_7_strides_0 = const()[name = string("v_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_7_pad_0 = const()[name = string("v_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_7_dilations_0 = const()[name = string("v_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_7_groups_0 = const()[name = string("v_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_7 = conv(dilations = v_7_dilations_0, groups = v_7_groups_0, pad = v_7_pad_0, pad_type = v_7_pad_type_0, strides = v_7_strides_0, weight = layers_3_self_attn_v_proj_weight_palettized, x = var_2611_cast_fp16)[name = string("v_7")];
+            tensor<int32, [4]> var_2737 = const()[name = string("op_2737"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_2738 = reshape(shape = var_2737, x = v_7)[name = string("op_2738")];
+            tensor<int32, [4]> var_2743 = const()[name = string("op_2743"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2761 = const()[name = string("op_2761"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_58 = transpose(perm = transpose_58_perm_0, x = var_2710)[name = string("transpose_176")];
+            tensor<fp16, [3, 2, 256]> x_63 = reshape(shape = var_2761, x = transpose_58)[name = string("x_63")];
+            int32 var_2767 = const()[name = string("op_2767"), val = int32(-1)];
+            fp16 const_39_promoted = const()[name = string("const_39_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_2769 = mul(x = x_63, y = const_39_promoted)[name = string("op_2769")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_97 = concat(axis = var_2767, interleave = input_97_interleave_0, values = (x_63, var_2769))[name = string("input_97")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2764_to_fp16 = const()[name = string("op_2764_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_2764_to_fp16, x = input_97)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_2774_split_sizes_0 = const()[name = string("op_2774_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2774_axis_0 = const()[name = string("op_2774_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_2774_0, tensor<fp16, [3, 2, 256]> var_2774_1 = split(axis = var_2774_axis_0, split_sizes = var_2774_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_2774")];
+            tensor<fp16, [3, 2, 256]> k_23 = mul(x = var_2774_0, y = layers_3_self_attn_k_norm_weight)[name = string("k_23")];
+            tensor<int32, [4]> var_2781 = const()[name = string("op_2781"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_2782 = reshape(shape = var_2781, x = k_23)[name = string("op_2782")];
+            tensor<int32, [4]> var_2787 = const()[name = string("op_2787"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_2789_promoted = const()[name = string("op_2789_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_2744 = transpose(perm = var_2743, x = var_2738)[name = string("transpose_175")];
+            tensor<fp16, [1, 2, 3, 256]> var_2790 = pow(x = var_2744, y = var_2789_promoted)[name = string("op_2790")];
+            tensor<int32, [1]> var_2795_axes_0 = const()[name = string("op_2795_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2795_keep_dims_0 = const()[name = string("op_2795_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_2795 = reduce_mean(axes = var_2795_axes_0, keep_dims = var_2795_keep_dims_0, x = var_2790)[name = string("op_2795")];
+            fp16 var_2797_to_fp16 = const()[name = string("op_2797_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_7_cast_fp16 = add(x = var_2795, y = var_2797_to_fp16)[name = string("mean_sq_7_cast_fp16")];
+            fp32 var_2799_epsilon_0 = const()[name = string("op_2799_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_2799_cast_fp16 = rsqrt(epsilon = var_2799_epsilon_0, x = mean_sq_7_cast_fp16)[name = string("op_2799_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_101_cast_fp16 = mul(x = var_2744, y = var_2799_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_45 = transpose(perm = var_2787, x = var_2782)[name = string("transpose_174")];
+            tensor<fp16, [1, 2, 3, 256]> var_2801_cast_fp16 = mul(x = q_45, y = cos_s)[name = string("op_2801_cast_fp16")];
+            tensor<int32, [2]> var_2802_split_sizes_0 = const()[name = string("op_2802_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2802_axis_0 = const()[name = string("op_2802_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_2802_0, tensor<fp16, [1, 2, 3, 128]> var_2802_1 = split(axis = var_2802_axis_0, split_sizes = var_2802_split_sizes_0, x = q_45)[name = string("op_2802")];
+            fp16 const_40_promoted = const()[name = string("const_40_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_2804 = mul(x = var_2802_1, y = const_40_promoted)[name = string("op_2804")];
+            int32 var_2806 = const()[name = string("op_2806"), val = int32(-1)];
+            bool var_2807_interleave_0 = const()[name = string("op_2807_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_2807 = concat(axis = var_2806, interleave = var_2807_interleave_0, values = (var_2804, var_2802_0))[name = string("op_2807")];
+            tensor<fp16, [1, 2, 3, 256]> var_2808_cast_fp16 = mul(x = var_2807, y = sin_s)[name = string("op_2808_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_99_cast_fp16 = add(x = var_2801_cast_fp16, y = var_2808_cast_fp16)[name = string("input_99_cast_fp16")];
+            tensor<int32, [8]> k_padded_7_pad_0 = const()[name = string("k_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_7_mode_0 = const()[name = string("k_padded_7_mode_0"), val = string("constant")];
+            fp16 const_41_to_fp16 = const()[name = string("const_41_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_7_cast_fp16 = pad(constant_val = const_41_to_fp16, mode = k_padded_7_mode_0, pad = k_padded_7_pad_0, x = input_99_cast_fp16)[name = string("k_padded_7_cast_fp16")];
+            tensor<int32, [8]> v_padded_7_pad_0 = const()[name = string("v_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_7_mode_0 = const()[name = string("v_padded_7_mode_0"), val = string("constant")];
+            fp16 const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_7_cast_fp16 = pad(constant_val = const_42_to_fp16, mode = v_padded_7_mode_0, pad = v_padded_7_pad_0, x = input_101_cast_fp16)[name = string("v_padded_7_cast_fp16")];
+            tensor<int32, [4]> slot_k_7_begin_0 = const()[name = string("slot_k_7_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_7_end_0 = const()[name = string("slot_k_7_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_7_end_mask_0 = const()[name = string("slot_k_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_7_cast_fp16 = slice_by_index(begin = slot_k_7_begin_0, end = slot_k_7_end_0, end_mask = slot_k_7_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("slot_k_7_cast_fp16")];
+            tensor<int32, [4]> slot_v_7_begin_0 = const()[name = string("slot_v_7_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_7_end_0 = const()[name = string("slot_v_7_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_7_end_mask_0 = const()[name = string("slot_v_7_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_7_cast_fp16 = slice_by_index(begin = slot_v_7_begin_0, end = slot_v_7_end_0, end_mask = slot_v_7_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("slot_v_7_cast_fp16")];
+            tensor<int32, [4]> var_2847_begin_0 = const()[name = string("op_2847_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2847_end_0 = const()[name = string("op_2847_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2847_end_mask_0 = const()[name = string("op_2847_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2847_cast_fp16 = slice_by_index(begin = var_2847_begin_0, end = var_2847_end_0, end_mask = var_2847_end_mask_0, x = slot_k_7_cast_fp16)[name = string("op_2847_cast_fp16")];
+            int32 var_2854 = const()[name = string("op_2854"), val = int32(2)];
+            bool new_k_7_interleave_0 = const()[name = string("new_k_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_7_cast_fp16 = concat(axis = var_2854, interleave = new_k_7_interleave_0, values = (var_2847_cast_fp16, k_padded_7_cast_fp16))[name = string("new_k_7_cast_fp16")];
+            tensor<int32, [4]> var_2870_begin_0 = const()[name = string("op_2870_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_2870_end_0 = const()[name = string("op_2870_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2870_end_mask_0 = const()[name = string("op_2870_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_2870_cast_fp16 = slice_by_index(begin = var_2870_begin_0, end = var_2870_end_0, end_mask = var_2870_end_mask_0, x = slot_v_7_cast_fp16)[name = string("op_2870_cast_fp16")];
+            int32 var_2877 = const()[name = string("op_2877"), val = int32(2)];
+            bool new_v_7_interleave_0 = const()[name = string("new_v_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_7_cast_fp16 = concat(axis = var_2877, interleave = new_v_7_interleave_0, values = (var_2870_cast_fp16, v_padded_7_cast_fp16))[name = string("new_v_7_cast_fp16")];
+            tensor<int32, [4]> var_2883_begin_0 = const()[name = string("op_2883_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2883_end_0 = const()[name = string("op_2883_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2883_end_mask_0 = const()[name = string("op_2883_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_2883_cast_fp16 = slice_by_index(begin = var_2883_begin_0, end = var_2883_end_0, end_mask = var_2883_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2883_cast_fp16")];
+            tensor<int32, [4]> var_2888_begin_0 = const()[name = string("op_2888_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_2888_end_0 = const()[name = string("op_2888_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2888_end_mask_0 = const()[name = string("op_2888_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_2888_cast_fp16 = slice_by_index(begin = var_2888_begin_0, end = var_2888_end_0, end_mask = var_2888_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2888_cast_fp16")];
+            int32 var_2890 = const()[name = string("op_2890"), val = int32(0)];
+            bool K_sliding_out_7_interleave_0 = const()[name = string("K_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_7_cast_fp16 = concat(axis = var_2890, interleave = K_sliding_out_7_interleave_0, values = (var_2883_cast_fp16, new_k_7_cast_fp16, var_2888_cast_fp16))[name = string("K_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_2896_begin_0 = const()[name = string("op_2896_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_2896_end_0 = const()[name = string("op_2896_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2896_end_mask_0 = const()[name = string("op_2896_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_2896_cast_fp16 = slice_by_index(begin = var_2896_begin_0, end = var_2896_end_0, end_mask = var_2896_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2896_cast_fp16")];
+            tensor<int32, [4]> var_2901_begin_0 = const()[name = string("op_2901_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_2901_end_0 = const()[name = string("op_2901_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_2901_end_mask_0 = const()[name = string("op_2901_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_2901_cast_fp16 = slice_by_index(begin = var_2901_begin_0, end = var_2901_end_0, end_mask = var_2901_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2901_cast_fp16")];
+            int32 var_2903 = const()[name = string("op_2903"), val = int32(0)];
+            bool V_sliding_out_7_interleave_0 = const()[name = string("V_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_7_cast_fp16 = concat(axis = var_2903, interleave = V_sliding_out_7_interleave_0, values = (var_2896_cast_fp16, new_v_7_cast_fp16, var_2901_cast_fp16))[name = string("V_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_2909_begin_0 = const()[name = string("op_2909_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2909_end_0 = const()[name = string("op_2909_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2909_end_mask_0 = const()[name = string("op_2909_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2909_cast_fp16 = slice_by_index(begin = var_2909_begin_0, end = var_2909_end_0, end_mask = var_2909_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("op_2909_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_7_begin_0 = const()[name = string("K_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_7_end_0 = const()[name = string("K_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_7_end_mask_0 = const()[name = string("K_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_7_cast_fp16 = slice_by_index(begin = K_for_attn_7_begin_0, end = K_for_attn_7_end_0, end_mask = K_for_attn_7_end_mask_0, x = var_2909_cast_fp16)[name = string("K_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> var_2919_begin_0 = const()[name = string("op_2919_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2919_end_0 = const()[name = string("op_2919_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2919_end_mask_0 = const()[name = string("op_2919_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_2919_cast_fp16 = slice_by_index(begin = var_2919_begin_0, end = var_2919_end_0, end_mask = var_2919_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("op_2919_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_7_begin_0 = const()[name = string("V_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_7_end_0 = const()[name = string("V_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_7_end_mask_0 = const()[name = string("V_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_7_cast_fp16 = slice_by_index(begin = V_for_attn_7_begin_0, end = V_for_attn_7_end_0, end_mask = V_for_attn_7_end_mask_0, x = var_2919_cast_fp16)[name = string("V_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> transpose_12_perm_0 = const()[name = string("transpose_12_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_6_reps_0 = const()[name = string("tile_6_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_12_cast_fp16 = transpose(perm = transpose_12_perm_0, x = K_for_attn_7_cast_fp16)[name = string("transpose_173")];
+            tensor<fp16, [8, 1, 512, 256]> tile_6_cast_fp16 = tile(reps = tile_6_reps_0, x = transpose_12_cast_fp16)[name = string("tile_6_cast_fp16")];
+            tensor<int32, [5]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_12_cast_fp16 = reshape(shape = concat_12, x = tile_6_cast_fp16)[name = string("reshape_12_cast_fp16")];
+            tensor<int32, [5]> transpose_13_perm_0 = const()[name = string("transpose_13_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_13 = const()[name = string("concat_13"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_13_cast_fp16 = transpose(perm = transpose_13_perm_0, x = reshape_12_cast_fp16)[name = string("transpose_172")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_13_cast_fp16 = reshape(shape = concat_13, x = transpose_13_cast_fp16)[name = string("reshape_13_cast_fp16")];
+            tensor<int32, [4]> transpose_59_perm_0 = const()[name = string("transpose_59_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_14_perm_0 = const()[name = string("transpose_14_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_7_reps_0 = const()[name = string("tile_7_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_14_cast_fp16 = transpose(perm = transpose_14_perm_0, x = V_for_attn_7_cast_fp16)[name = string("transpose_171")];
+            tensor<fp16, [8, 1, 512, 256]> tile_7_cast_fp16 = tile(reps = tile_7_reps_0, x = transpose_14_cast_fp16)[name = string("tile_7_cast_fp16")];
+            tensor<int32, [5]> concat_14 = const()[name = string("concat_14"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_14_cast_fp16 = reshape(shape = concat_14, x = tile_7_cast_fp16)[name = string("reshape_14_cast_fp16")];
+            tensor<int32, [5]> transpose_15_perm_0 = const()[name = string("transpose_15_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_15 = const()[name = string("concat_15"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_15_cast_fp16 = transpose(perm = transpose_15_perm_0, x = reshape_14_cast_fp16)[name = string("transpose_170")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_15_cast_fp16 = reshape(shape = concat_15, x = transpose_15_cast_fp16)[name = string("reshape_15_cast_fp16")];
+            tensor<int32, [4]> V_expanded_7_perm_0 = const()[name = string("V_expanded_7_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_59_cast_fp16 = transpose(perm = transpose_59_perm_0, x = reshape_13_cast_fp16)[name = string("transpose_169")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_47_cast_fp16, y = transpose_59_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_67_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 3, 512]> var_2954 = sub(x = x_67_cast_fp16, y = reduce_max_3)[name = string("op_2954")];
+            tensor<fp16, [1, 8, 3, 512]> var_2960 = exp(x = var_2954)[name = string("op_2960")];
+            tensor<int32, [1]> var_2970_axes_0 = const()[name = string("op_2970_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2970_keep_dims_0 = const()[name = string("op_2970_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2970 = reduce_sum(axes = var_2970_axes_0, keep_dims = var_2970_keep_dims_0, x = var_2960)[name = string("op_2970")];
+            tensor<fp16, [1, 8, 3, 512]> var_2976_cast_fp16 = real_div(x = var_2960, y = var_2970)[name = string("op_2976_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_7_cast_fp16 = transpose(perm = V_expanded_7_perm_0, x = reshape_15_cast_fp16)[name = string("transpose_168")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_2976_cast_fp16, y = V_expanded_7_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_2987 = const()[name = string("op_2987"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2994 = const()[name = string("op_2994"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2988_cast_fp16 = transpose(perm = var_2987, x = attn_output_19_cast_fp16)[name = string("transpose_167")];
+            tensor<fp16, [1, 3, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_2994, x = var_2988_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_2999 = const()[name = string("op_2999"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3015_pad_type_0 = const()[name = string("op_3015_pad_type_0"), val = string("valid")];
+            int32 var_3015_groups_0 = const()[name = string("op_3015_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3015_strides_0 = const()[name = string("op_3015_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3015_pad_0 = const()[name = string("op_3015_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3015_dilations_0 = const()[name = string("op_3015_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(540182208))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542803712))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3000_cast_fp16 = transpose(perm = var_2999, x = attn_output_21_cast_fp16)[name = string("transpose_166")];
+            tensor<fp16, [1, 2560, 3]> var_3015_cast_fp16 = conv(dilations = var_3015_dilations_0, groups = var_3015_groups_0, pad = var_3015_pad_0, pad_type = var_3015_pad_type_0, strides = var_3015_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_3000_cast_fp16)[name = string("op_3015_cast_fp16")];
+            tensor<int32, [3]> var_3019 = const()[name = string("op_3019"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3025 = const()[name = string("op_3025"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_71_cast_fp16 = transpose(perm = var_3019, x = var_3015_cast_fp16)[name = string("transpose_165")];
+            tensor<fp16, [1, 3, 2560]> var_3027_cast_fp16 = mul(x = x_71_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_3027_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_105_cast_fp16 = concat(axis = var_3025, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_3027_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3022_to_fp16 = const()[name = string("op_3022_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_3022_to_fp16, x = input_105_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_3032_split_sizes_0 = const()[name = string("op_3032_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3032_axis_0 = const()[name = string("op_3032_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3032_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3032_cast_fp16_1 = split(axis = var_3032_axis_0, split_sizes = var_3032_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_3032_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542806336)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_23_cast_fp16 = mul(x = var_3032_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_73_cast_fp16 = add(x = x_59_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_3041 = const()[name = string("op_3041"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3043_cast_fp16 = mul(x = x_73_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_3043_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_107_cast_fp16 = concat(axis = var_3041, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_3043_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3038_to_fp16 = const()[name = string("op_3038_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_3038_to_fp16, x = input_107_cast_fp16)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_3048_split_sizes_0 = const()[name = string("op_3048_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3048_axis_0 = const()[name = string("op_3048_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3048_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3048_cast_fp16_1 = split(axis = var_3048_axis_0, split_sizes = var_3048_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_3048_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542811520)))];
+            tensor<fp16, [1, 3, 2560]> h_21_cast_fp16 = mul(x = var_3048_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_3059 = const()[name = string("op_3059"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3060 = transpose(perm = var_3059, x = h_21_cast_fp16)[name = string("transpose_164")];
+            tensor<fp16, [1, 2560, 1, 3]> input_109 = expand_dims(axes = input_109_axes_0, x = var_3060)[name = string("input_109")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 3]> input_111 = mul(x = gate_15, y = up_7)[name = string("input_111")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_3100_axes_0 = const()[name = string("op_3100_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3100 = squeeze(axes = var_3100_axes_0, x = mlp_out_7)[name = string("op_3100")];
+            tensor<int32, [3]> var_3104 = const()[name = string("op_3104"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3110 = const()[name = string("op_3110"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_75 = transpose(perm = var_3104, x = var_3100)[name = string("transpose_163")];
+            tensor<fp16, [1, 3, 2560]> var_3112 = mul(x = x_75, y = const_45_promoted)[name = string("op_3112")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_113 = concat(axis = var_3110, interleave = input_113_interleave_0, values = (x_75, var_3112))[name = string("input_113")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3107_to_fp16 = const()[name = string("op_3107_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_3107_to_fp16, x = input_113)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_3117_split_sizes_0 = const()[name = string("op_3117_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3117_axis_0 = const()[name = string("op_3117_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3117_0, tensor<fp16, [1, 3, 2560]> var_3117_1 = split(axis = var_3117_axis_0, split_sizes = var_3117_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_3117")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_33 = mul(x = var_3117_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_35_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 3840])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 3, 4096])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_3145 = const()[name = string("op_3145"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3146 = transpose(perm = var_3145, x = hidden_states_35_cast_fp16)[name = string("transpose_162")];
+            tensor<fp16, [1, 2560, 1, 3]> input_115 = expand_dims(axes = input_115_axes_0, x = var_3146)[name = string("input_115")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_3165 = const()[name = string("op_3165"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3166_cast_fp16 = transpose(perm = var_3165, x = per_layer_slice_7_cast_fp16)[name = string("transpose_161")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_3166_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_117_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(542816704))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543144448))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_3182_axes_0 = const()[name = string("op_3182_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3182_cast_fp16 = squeeze(axes = var_3182_axes_0, x = gated_23_cast_fp16)[name = string("op_3182_cast_fp16")];
+            tensor<int32, [3]> var_3186 = const()[name = string("op_3186"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3192 = const()[name = string("op_3192"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_77_cast_fp16 = transpose(perm = var_3186, x = var_3182_cast_fp16)[name = string("transpose_160")];
+            tensor<fp16, [1, 3, 2560]> var_3194_cast_fp16 = mul(x = x_77_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_3194_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_119_cast_fp16 = concat(axis = var_3192, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_3194_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3189_to_fp16 = const()[name = string("op_3189_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_3189_to_fp16, x = input_119_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_3199_split_sizes_0 = const()[name = string("op_3199_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3199_axis_0 = const()[name = string("op_3199_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3199_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3199_cast_fp16_1 = split(axis = var_3199_axis_0, split_sizes = var_3199_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_3199_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543147072)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_39_cast_fp16 = mul(x = var_3199_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.14p-1])];
+            tensor<fp16, [1, 3, 2560]> x_79_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_3214 = const()[name = string("op_3214"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3216_cast_fp16 = mul(x = x_79_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_3216_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_121_cast_fp16 = concat(axis = var_3214, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_3216_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3211_to_fp16 = const()[name = string("op_3211_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_3211_to_fp16, x = input_121_cast_fp16)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_3221_split_sizes_0 = const()[name = string("op_3221_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3221_axis_0 = const()[name = string("op_3221_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3221_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3221_cast_fp16_1 = split(axis = var_3221_axis_0, split_sizes = var_3221_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_3221_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543152256)))];
+            tensor<fp16, [1, 3, 2560]> h_25_cast_fp16 = mul(x = var_3221_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_3227 = const()[name = string("op_3227"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3230_axes_0 = const()[name = string("op_3230_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3228_cast_fp16 = transpose(perm = var_3227, x = h_25_cast_fp16)[name = string("transpose_159")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3230_cast_fp16 = expand_dims(axes = var_3230_axes_0, x = var_3228_cast_fp16)[name = string("op_3230_cast_fp16")];
+            string q_49_pad_type_0 = const()[name = string("q_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_49_strides_0 = const()[name = string("q_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_49_pad_0 = const()[name = string("q_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_49_dilations_0 = const()[name = string("q_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_49_groups_0 = const()[name = string("q_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_49 = conv(dilations = q_49_dilations_0, groups = q_49_groups_0, pad = q_49_pad_0, pad_type = q_49_pad_type_0, strides = q_49_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_3230_cast_fp16)[name = string("q_49")];
+            tensor<int32, [4]> var_3251 = const()[name = string("op_3251"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_3252 = reshape(shape = var_3251, x = q_49)[name = string("op_3252")];
+            tensor<int32, [4]> transpose_60_perm_0 = const()[name = string("transpose_60_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3275 = const()[name = string("op_3275"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_60 = transpose(perm = transpose_60_perm_0, x = var_3252)[name = string("transpose_158")];
+            tensor<fp16, [3, 8, 256]> x_81 = reshape(shape = var_3275, x = transpose_60)[name = string("x_81")];
+            int32 var_3281 = const()[name = string("op_3281"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_3283 = mul(x = x_81, y = const_49_promoted)[name = string("op_3283")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_125 = concat(axis = var_3281, interleave = input_125_interleave_0, values = (x_81, var_3283))[name = string("input_125")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3278_to_fp16 = const()[name = string("op_3278_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_3278_to_fp16, x = input_125)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_3288_split_sizes_0 = const()[name = string("op_3288_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3288_axis_0 = const()[name = string("op_3288_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_3288_0, tensor<fp16, [3, 8, 256]> var_3288_1 = split(axis = var_3288_axis_0, split_sizes = var_3288_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_3288")];
+            tensor<fp16, [3, 8, 256]> q_53 = mul(x = var_3288_0, y = layers_4_self_attn_q_norm_weight)[name = string("q_53")];
+            tensor<int32, [4]> var_3295 = const()[name = string("op_3295"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_3296 = reshape(shape = var_3295, x = q_53)[name = string("op_3296")];
+            tensor<int32, [4]> var_3301 = const()[name = string("op_3301"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_55 = transpose(perm = var_3301, x = var_3296)[name = string("transpose_157")];
+            tensor<fp16, [1, 8, 3, 256]> var_3303_cast_fp16 = mul(x = q_55, y = cos_s)[name = string("op_3303_cast_fp16")];
+            tensor<int32, [2]> var_3304_split_sizes_0 = const()[name = string("op_3304_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3304_axis_0 = const()[name = string("op_3304_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_3304_0, tensor<fp16, [1, 8, 3, 128]> var_3304_1 = split(axis = var_3304_axis_0, split_sizes = var_3304_split_sizes_0, x = q_55)[name = string("op_3304")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_3306 = mul(x = var_3304_1, y = const_50_promoted)[name = string("op_3306")];
+            int32 var_3308 = const()[name = string("op_3308"), val = int32(-1)];
+            bool var_3309_interleave_0 = const()[name = string("op_3309_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_3309 = concat(axis = var_3308, interleave = var_3309_interleave_0, values = (var_3306, var_3304_0))[name = string("op_3309")];
+            tensor<fp16, [1, 8, 3, 256]> var_3310_cast_fp16 = mul(x = var_3309, y = sin_s)[name = string("op_3310_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_59_cast_fp16 = add(x = var_3303_cast_fp16, y = var_3310_cast_fp16)[name = string("q_59_cast_fp16")];
+            string k_25_pad_type_0 = const()[name = string("k_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_25_strides_0 = const()[name = string("k_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_25_pad_0 = const()[name = string("k_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_25_dilations_0 = const()[name = string("k_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_25_groups_0 = const()[name = string("k_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_25 = conv(dilations = k_25_dilations_0, groups = k_25_groups_0, pad = k_25_pad_0, pad_type = k_25_pad_type_0, strides = k_25_strides_0, weight = layers_4_self_attn_k_proj_weight_palettized, x = var_3230_cast_fp16)[name = string("k_25")];
+            tensor<int32, [4]> var_3328 = const()[name = string("op_3328"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_3329 = reshape(shape = var_3328, x = k_25)[name = string("op_3329")];
+            tensor<int32, [4]> transpose_61_perm_0 = const()[name = string("transpose_61_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_9_pad_type_0 = const()[name = string("v_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_9_strides_0 = const()[name = string("v_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_9_pad_0 = const()[name = string("v_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_9_dilations_0 = const()[name = string("v_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_9_groups_0 = const()[name = string("v_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_9 = conv(dilations = v_9_dilations_0, groups = v_9_groups_0, pad = v_9_pad_0, pad_type = v_9_pad_type_0, strides = v_9_strides_0, weight = layers_4_self_attn_v_proj_weight_palettized, x = var_3230_cast_fp16)[name = string("v_9")];
+            tensor<int32, [4]> var_3356 = const()[name = string("op_3356"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_3357 = reshape(shape = var_3356, x = v_9)[name = string("op_3357")];
+            tensor<int32, [4]> var_3362 = const()[name = string("op_3362"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3380 = const()[name = string("op_3380"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_61 = transpose(perm = transpose_61_perm_0, x = var_3329)[name = string("transpose_156")];
+            tensor<fp16, [3, 2, 256]> x_83 = reshape(shape = var_3380, x = transpose_61)[name = string("x_83")];
+            int32 var_3386 = const()[name = string("op_3386"), val = int32(-1)];
+            fp16 const_51_promoted = const()[name = string("const_51_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_3388 = mul(x = x_83, y = const_51_promoted)[name = string("op_3388")];
+            bool input_127_interleave_0 = const()[name = string("input_127_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_127 = concat(axis = var_3386, interleave = input_127_interleave_0, values = (x_83, var_3388))[name = string("input_127")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3383_to_fp16 = const()[name = string("op_3383_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_3383_to_fp16, x = input_127)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_3393_split_sizes_0 = const()[name = string("op_3393_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3393_axis_0 = const()[name = string("op_3393_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_3393_0, tensor<fp16, [3, 2, 256]> var_3393_1 = split(axis = var_3393_axis_0, split_sizes = var_3393_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_3393")];
+            tensor<fp16, [3, 2, 256]> k_29 = mul(x = var_3393_0, y = layers_4_self_attn_k_norm_weight)[name = string("k_29")];
+            tensor<int32, [4]> var_3400 = const()[name = string("op_3400"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_3401 = reshape(shape = var_3400, x = k_29)[name = string("op_3401")];
+            tensor<int32, [4]> var_3406 = const()[name = string("op_3406"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_3408_promoted = const()[name = string("op_3408_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_3363 = transpose(perm = var_3362, x = var_3357)[name = string("transpose_155")];
+            tensor<fp16, [1, 2, 3, 256]> var_3409 = pow(x = var_3363, y = var_3408_promoted)[name = string("op_3409")];
+            tensor<int32, [1]> var_3414_axes_0 = const()[name = string("op_3414_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3414_keep_dims_0 = const()[name = string("op_3414_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_3414 = reduce_mean(axes = var_3414_axes_0, keep_dims = var_3414_keep_dims_0, x = var_3409)[name = string("op_3414")];
+            fp16 var_3416_to_fp16 = const()[name = string("op_3416_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_9_cast_fp16 = add(x = var_3414, y = var_3416_to_fp16)[name = string("mean_sq_9_cast_fp16")];
+            fp32 var_3418_epsilon_0 = const()[name = string("op_3418_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_3418_cast_fp16 = rsqrt(epsilon = var_3418_epsilon_0, x = mean_sq_9_cast_fp16)[name = string("op_3418_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_131_cast_fp16 = mul(x = var_3363, y = var_3418_cast_fp16)[name = string("input_131_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_57 = transpose(perm = var_3406, x = var_3401)[name = string("transpose_154")];
+            tensor<fp16, [1, 2, 3, 256]> var_3420_cast_fp16 = mul(x = q_57, y = cos_s)[name = string("op_3420_cast_fp16")];
+            tensor<int32, [2]> var_3421_split_sizes_0 = const()[name = string("op_3421_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3421_axis_0 = const()[name = string("op_3421_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_3421_0, tensor<fp16, [1, 2, 3, 128]> var_3421_1 = split(axis = var_3421_axis_0, split_sizes = var_3421_split_sizes_0, x = q_57)[name = string("op_3421")];
+            fp16 const_52_promoted = const()[name = string("const_52_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_3423 = mul(x = var_3421_1, y = const_52_promoted)[name = string("op_3423")];
+            int32 var_3425 = const()[name = string("op_3425"), val = int32(-1)];
+            bool var_3426_interleave_0 = const()[name = string("op_3426_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_3426 = concat(axis = var_3425, interleave = var_3426_interleave_0, values = (var_3423, var_3421_0))[name = string("op_3426")];
+            tensor<fp16, [1, 2, 3, 256]> var_3427_cast_fp16 = mul(x = var_3426, y = sin_s)[name = string("op_3427_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_129_cast_fp16 = add(x = var_3420_cast_fp16, y = var_3427_cast_fp16)[name = string("input_129_cast_fp16")];
+            tensor<int32, [8]> k_padded_9_pad_0 = const()[name = string("k_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_9_mode_0 = const()[name = string("k_padded_9_mode_0"), val = string("constant")];
+            fp16 const_53_to_fp16 = const()[name = string("const_53_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_9_cast_fp16 = pad(constant_val = const_53_to_fp16, mode = k_padded_9_mode_0, pad = k_padded_9_pad_0, x = input_129_cast_fp16)[name = string("k_padded_9_cast_fp16")];
+            tensor<int32, [8]> v_padded_9_pad_0 = const()[name = string("v_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_9_mode_0 = const()[name = string("v_padded_9_mode_0"), val = string("constant")];
+            fp16 const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_9_cast_fp16 = pad(constant_val = const_54_to_fp16, mode = v_padded_9_mode_0, pad = v_padded_9_pad_0, x = input_131_cast_fp16)[name = string("v_padded_9_cast_fp16")];
+            tensor<int32, [4]> slot_k_9_begin_0 = const()[name = string("slot_k_9_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_9_end_0 = const()[name = string("slot_k_9_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_9_end_mask_0 = const()[name = string("slot_k_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_9_cast_fp16 = slice_by_index(begin = slot_k_9_begin_0, end = slot_k_9_end_0, end_mask = slot_k_9_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("slot_k_9_cast_fp16")];
+            tensor<int32, [4]> slot_v_9_begin_0 = const()[name = string("slot_v_9_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_9_end_0 = const()[name = string("slot_v_9_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_9_end_mask_0 = const()[name = string("slot_v_9_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_9_cast_fp16 = slice_by_index(begin = slot_v_9_begin_0, end = slot_v_9_end_0, end_mask = slot_v_9_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("slot_v_9_cast_fp16")];
+            tensor<int32, [4]> var_3466_begin_0 = const()[name = string("op_3466_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_3466_end_0 = const()[name = string("op_3466_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3466_end_mask_0 = const()[name = string("op_3466_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_3466_cast_fp16 = slice_by_index(begin = var_3466_begin_0, end = var_3466_end_0, end_mask = var_3466_end_mask_0, x = slot_k_9_cast_fp16)[name = string("op_3466_cast_fp16")];
+            int32 var_3473 = const()[name = string("op_3473"), val = int32(2)];
+            bool new_k_9_interleave_0 = const()[name = string("new_k_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_9_cast_fp16 = concat(axis = var_3473, interleave = new_k_9_interleave_0, values = (var_3466_cast_fp16, k_padded_9_cast_fp16))[name = string("new_k_9_cast_fp16")];
+            tensor<int32, [4]> var_3489_begin_0 = const()[name = string("op_3489_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_3489_end_0 = const()[name = string("op_3489_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3489_end_mask_0 = const()[name = string("op_3489_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_3489_cast_fp16 = slice_by_index(begin = var_3489_begin_0, end = var_3489_end_0, end_mask = var_3489_end_mask_0, x = slot_v_9_cast_fp16)[name = string("op_3489_cast_fp16")];
+            int32 var_3496 = const()[name = string("op_3496"), val = int32(2)];
+            bool new_v_9_interleave_0 = const()[name = string("new_v_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_9_cast_fp16 = concat(axis = var_3496, interleave = new_v_9_interleave_0, values = (var_3489_cast_fp16, v_padded_9_cast_fp16))[name = string("new_v_9_cast_fp16")];
+            tensor<int32, [4]> var_3502_begin_0 = const()[name = string("op_3502_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3502_end_0 = const()[name = string("op_3502_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_3502_end_mask_0 = const()[name = string("op_3502_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_3502_cast_fp16 = slice_by_index(begin = var_3502_begin_0, end = var_3502_end_0, end_mask = var_3502_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("op_3502_cast_fp16")];
+            tensor<int32, [4]> var_3507_begin_0 = const()[name = string("op_3507_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_3507_end_0 = const()[name = string("op_3507_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_3507_end_mask_0 = const()[name = string("op_3507_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_3507_cast_fp16 = slice_by_index(begin = var_3507_begin_0, end = var_3507_end_0, end_mask = var_3507_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("op_3507_cast_fp16")];
+            int32 var_3509 = const()[name = string("op_3509"), val = int32(0)];
+            bool K_sliding_out_9_interleave_0 = const()[name = string("K_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_9_cast_fp16 = concat(axis = var_3509, interleave = K_sliding_out_9_interleave_0, values = (var_3502_cast_fp16, new_k_9_cast_fp16, var_3507_cast_fp16))[name = string("K_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3515_begin_0 = const()[name = string("op_3515_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3515_end_0 = const()[name = string("op_3515_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_3515_end_mask_0 = const()[name = string("op_3515_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_3515_cast_fp16 = slice_by_index(begin = var_3515_begin_0, end = var_3515_end_0, end_mask = var_3515_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("op_3515_cast_fp16")];
+            tensor<int32, [4]> var_3520_begin_0 = const()[name = string("op_3520_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_3520_end_0 = const()[name = string("op_3520_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_3520_end_mask_0 = const()[name = string("op_3520_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_3520_cast_fp16 = slice_by_index(begin = var_3520_begin_0, end = var_3520_end_0, end_mask = var_3520_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("op_3520_cast_fp16")];
+            int32 var_3522 = const()[name = string("op_3522"), val = int32(0)];
+            bool V_sliding_out_9_interleave_0 = const()[name = string("V_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_9_cast_fp16 = concat(axis = var_3522, interleave = V_sliding_out_9_interleave_0, values = (var_3515_cast_fp16, new_v_9_cast_fp16, var_3520_cast_fp16))[name = string("V_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3528_begin_0 = const()[name = string("op_3528_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3528_end_0 = const()[name = string("op_3528_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3528_end_mask_0 = const()[name = string("op_3528_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_3528_cast_fp16 = slice_by_index(begin = var_3528_begin_0, end = var_3528_end_0, end_mask = var_3528_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("op_3528_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_9_begin_0 = const()[name = string("K_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_9_end_0 = const()[name = string("K_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_9_end_mask_0 = const()[name = string("K_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_9_cast_fp16 = slice_by_index(begin = K_for_attn_9_begin_0, end = K_for_attn_9_end_0, end_mask = K_for_attn_9_end_mask_0, x = var_3528_cast_fp16)[name = string("K_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> var_3538_begin_0 = const()[name = string("op_3538_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3538_end_0 = const()[name = string("op_3538_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3538_end_mask_0 = const()[name = string("op_3538_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_3538_cast_fp16 = slice_by_index(begin = var_3538_begin_0, end = var_3538_end_0, end_mask = var_3538_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("op_3538_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_9_begin_0 = const()[name = string("V_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_9_end_0 = const()[name = string("V_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_9_end_mask_0 = const()[name = string("V_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_9_cast_fp16 = slice_by_index(begin = V_for_attn_9_begin_0, end = V_for_attn_9_end_0, end_mask = V_for_attn_9_end_mask_0, x = var_3538_cast_fp16)[name = string("V_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> transpose_16_perm_0 = const()[name = string("transpose_16_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_8_reps_0 = const()[name = string("tile_8_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_16_cast_fp16 = transpose(perm = transpose_16_perm_0, x = K_for_attn_9_cast_fp16)[name = string("transpose_153")];
+            tensor<fp16, [8, 1, 512, 256]> tile_8_cast_fp16 = tile(reps = tile_8_reps_0, x = transpose_16_cast_fp16)[name = string("tile_8_cast_fp16")];
+            tensor<int32, [5]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_16_cast_fp16 = reshape(shape = concat_16, x = tile_8_cast_fp16)[name = string("reshape_16_cast_fp16")];
+            tensor<int32, [5]> transpose_17_perm_0 = const()[name = string("transpose_17_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_17 = const()[name = string("concat_17"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_17_cast_fp16 = transpose(perm = transpose_17_perm_0, x = reshape_16_cast_fp16)[name = string("transpose_152")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_17_cast_fp16 = reshape(shape = concat_17, x = transpose_17_cast_fp16)[name = string("reshape_17_cast_fp16")];
+            tensor<int32, [4]> transpose_62_perm_0 = const()[name = string("transpose_62_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_18_perm_0 = const()[name = string("transpose_18_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_9_reps_0 = const()[name = string("tile_9_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_18_cast_fp16 = transpose(perm = transpose_18_perm_0, x = V_for_attn_9_cast_fp16)[name = string("transpose_151")];
+            tensor<fp16, [8, 1, 512, 256]> tile_9_cast_fp16 = tile(reps = tile_9_reps_0, x = transpose_18_cast_fp16)[name = string("tile_9_cast_fp16")];
+            tensor<int32, [5]> concat_18 = const()[name = string("concat_18"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_18_cast_fp16 = reshape(shape = concat_18, x = tile_9_cast_fp16)[name = string("reshape_18_cast_fp16")];
+            tensor<int32, [5]> transpose_19_perm_0 = const()[name = string("transpose_19_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_19 = const()[name = string("concat_19"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_19_cast_fp16 = transpose(perm = transpose_19_perm_0, x = reshape_18_cast_fp16)[name = string("transpose_150")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_19_cast_fp16 = reshape(shape = concat_19, x = transpose_19_cast_fp16)[name = string("reshape_19_cast_fp16")];
+            tensor<int32, [4]> V_expanded_9_perm_0 = const()[name = string("V_expanded_9_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_62_cast_fp16 = transpose(perm = transpose_62_perm_0, x = reshape_17_cast_fp16)[name = string("transpose_149")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_59_cast_fp16, y = transpose_62_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_87_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_87_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_87_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 3, 512]> var_3573 = sub(x = x_87_cast_fp16, y = reduce_max_4)[name = string("op_3573")];
+            tensor<fp16, [1, 8, 3, 512]> var_3579 = exp(x = var_3573)[name = string("op_3579")];
+            tensor<int32, [1]> var_3589_axes_0 = const()[name = string("op_3589_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3589_keep_dims_0 = const()[name = string("op_3589_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3589 = reduce_sum(axes = var_3589_axes_0, keep_dims = var_3589_keep_dims_0, x = var_3579)[name = string("op_3589")];
+            tensor<fp16, [1, 8, 3, 512]> var_3595_cast_fp16 = real_div(x = var_3579, y = var_3589)[name = string("op_3595_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_9_cast_fp16 = transpose(perm = V_expanded_9_perm_0, x = reshape_19_cast_fp16)[name = string("transpose_148")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_3595_cast_fp16, y = V_expanded_9_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_3606 = const()[name = string("op_3606"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3613 = const()[name = string("op_3613"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_3607_cast_fp16 = transpose(perm = var_3606, x = attn_output_25_cast_fp16)[name = string("transpose_147")];
+            tensor<fp16, [1, 3, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_3613, x = var_3607_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_3618 = const()[name = string("op_3618"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3634_pad_type_0 = const()[name = string("op_3634_pad_type_0"), val = string("valid")];
+            int32 var_3634_groups_0 = const()[name = string("op_3634_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3634_strides_0 = const()[name = string("op_3634_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3634_pad_0 = const()[name = string("op_3634_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3634_dilations_0 = const()[name = string("op_3634_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(543157440))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545778944))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3619_cast_fp16 = transpose(perm = var_3618, x = attn_output_27_cast_fp16)[name = string("transpose_146")];
+            tensor<fp16, [1, 2560, 3]> var_3634_cast_fp16 = conv(dilations = var_3634_dilations_0, groups = var_3634_groups_0, pad = var_3634_pad_0, pad_type = var_3634_pad_type_0, strides = var_3634_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_3619_cast_fp16)[name = string("op_3634_cast_fp16")];
+            tensor<int32, [3]> var_3638 = const()[name = string("op_3638"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3644 = const()[name = string("op_3644"), val = int32(-1)];
+            fp16 const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_91_cast_fp16 = transpose(perm = var_3638, x = var_3634_cast_fp16)[name = string("transpose_145")];
+            tensor<fp16, [1, 3, 2560]> var_3646_cast_fp16 = mul(x = x_91_cast_fp16, y = const_55_promoted_to_fp16)[name = string("op_3646_cast_fp16")];
+            bool input_135_interleave_0 = const()[name = string("input_135_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_135_cast_fp16 = concat(axis = var_3644, interleave = input_135_interleave_0, values = (x_91_cast_fp16, var_3646_cast_fp16))[name = string("input_135_cast_fp16")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3641_to_fp16 = const()[name = string("op_3641_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_3641_to_fp16, x = input_135_cast_fp16)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_3651_split_sizes_0 = const()[name = string("op_3651_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3651_axis_0 = const()[name = string("op_3651_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3651_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3651_cast_fp16_1 = split(axis = var_3651_axis_0, split_sizes = var_3651_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_3651_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545781568)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_29_cast_fp16 = mul(x = var_3651_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_93_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_93_cast_fp16")];
+            int32 var_3660 = const()[name = string("op_3660"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3662_cast_fp16 = mul(x = x_93_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3662_cast_fp16")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_137_cast_fp16 = concat(axis = var_3660, interleave = input_137_interleave_0, values = (x_93_cast_fp16, var_3662_cast_fp16))[name = string("input_137_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3657_to_fp16 = const()[name = string("op_3657_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_3657_to_fp16, x = input_137_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_3667_split_sizes_0 = const()[name = string("op_3667_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3667_axis_0 = const()[name = string("op_3667_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3667_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3667_cast_fp16_1 = split(axis = var_3667_axis_0, split_sizes = var_3667_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_3667_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545786752)))];
+            tensor<fp16, [1, 3, 2560]> h_27_cast_fp16 = mul(x = var_3667_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_3678 = const()[name = string("op_3678"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3679 = transpose(perm = var_3678, x = h_27_cast_fp16)[name = string("transpose_144")];
+            tensor<fp16, [1, 2560, 1, 3]> input_139 = expand_dims(axes = input_139_axes_0, x = var_3679)[name = string("input_139")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_139)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_139)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 3]> input_141 = mul(x = gate_19, y = up_9)[name = string("input_141")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_141)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_3719_axes_0 = const()[name = string("op_3719_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3719 = squeeze(axes = var_3719_axes_0, x = mlp_out_9)[name = string("op_3719")];
+            tensor<int32, [3]> var_3723 = const()[name = string("op_3723"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3729 = const()[name = string("op_3729"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_95 = transpose(perm = var_3723, x = var_3719)[name = string("transpose_143")];
+            tensor<fp16, [1, 3, 2560]> var_3731 = mul(x = x_95, y = const_57_promoted)[name = string("op_3731")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_143 = concat(axis = var_3729, interleave = input_143_interleave_0, values = (x_95, var_3731))[name = string("input_143")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3726_to_fp16 = const()[name = string("op_3726_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_3726_to_fp16, x = input_143)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_3736_split_sizes_0 = const()[name = string("op_3736_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3736_axis_0 = const()[name = string("op_3736_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3736_0, tensor<fp16, [1, 3, 2560]> var_3736_1 = split(axis = var_3736_axis_0, split_sizes = var_3736_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_3736")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_43 = mul(x = var_3736_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_45_cast_fp16 = add(x = x_93_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 4096])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 3, 4352])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_3764 = const()[name = string("op_3764"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_145_axes_0 = const()[name = string("input_145_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3765 = transpose(perm = var_3764, x = hidden_states_45_cast_fp16)[name = string("transpose_142")];
+            tensor<fp16, [1, 2560, 1, 3]> input_145 = expand_dims(axes = input_145_axes_0, x = var_3765)[name = string("input_145")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_145)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_3784 = const()[name = string("op_3784"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3785_cast_fp16 = transpose(perm = var_3784, x = per_layer_slice_9_cast_fp16)[name = string("transpose_141")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_3785_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_147_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_147_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(545791936))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546119680))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_147_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_3801_axes_0 = const()[name = string("op_3801_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3801_cast_fp16 = squeeze(axes = var_3801_axes_0, x = gated_29_cast_fp16)[name = string("op_3801_cast_fp16")];
+            tensor<int32, [3]> var_3805 = const()[name = string("op_3805"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3811 = const()[name = string("op_3811"), val = int32(-1)];
+            fp16 const_58_promoted_to_fp16 = const()[name = string("const_58_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_97_cast_fp16 = transpose(perm = var_3805, x = var_3801_cast_fp16)[name = string("transpose_140")];
+            tensor<fp16, [1, 3, 2560]> var_3813_cast_fp16 = mul(x = x_97_cast_fp16, y = const_58_promoted_to_fp16)[name = string("op_3813_cast_fp16")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_149_cast_fp16 = concat(axis = var_3811, interleave = input_149_interleave_0, values = (x_97_cast_fp16, var_3813_cast_fp16))[name = string("input_149_cast_fp16")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3808_to_fp16 = const()[name = string("op_3808_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_3808_to_fp16, x = input_149_cast_fp16)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_3818_split_sizes_0 = const()[name = string("op_3818_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3818_axis_0 = const()[name = string("op_3818_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3818_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3818_cast_fp16_1 = split(axis = var_3818_axis_0, split_sizes = var_3818_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_3818_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546122304)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_49_cast_fp16 = mul(x = var_3818_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.46p-1])];
+            tensor<fp16, [1, 3, 2560]> x_99_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_59_promoted_to_fp16)[name = string("x_99_cast_fp16")];
+            int32 var_3833 = const()[name = string("op_3833"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3835_cast_fp16 = mul(x = x_99_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3835_cast_fp16")];
+            bool input_151_interleave_0 = const()[name = string("input_151_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_151_cast_fp16 = concat(axis = var_3833, interleave = input_151_interleave_0, values = (x_99_cast_fp16, var_3835_cast_fp16))[name = string("input_151_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3830_to_fp16 = const()[name = string("op_3830_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_3830_to_fp16, x = input_151_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_3840_split_sizes_0 = const()[name = string("op_3840_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3840_axis_0 = const()[name = string("op_3840_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3840_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3840_cast_fp16_1 = split(axis = var_3840_axis_0, split_sizes = var_3840_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_3840_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546127488)))];
+            tensor<fp16, [1, 3, 2560]> h_31_cast_fp16 = mul(x = var_3840_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_3846 = const()[name = string("op_3846"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3849_axes_0 = const()[name = string("op_3849_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3847_cast_fp16 = transpose(perm = var_3846, x = h_31_cast_fp16)[name = string("transpose_139")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3849_cast_fp16 = expand_dims(axes = var_3849_axes_0, x = var_3847_cast_fp16)[name = string("op_3849_cast_fp16")];
+            string q_61_pad_type_0 = const()[name = string("q_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_61_strides_0 = const()[name = string("q_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_61_pad_0 = const()[name = string("q_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_61_dilations_0 = const()[name = string("q_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_61_groups_0 = const()[name = string("q_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_61 = conv(dilations = q_61_dilations_0, groups = q_61_groups_0, pad = q_61_pad_0, pad_type = q_61_pad_type_0, strides = q_61_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_3849_cast_fp16)[name = string("q_61")];
+            tensor<int32, [4]> var_3870 = const()[name = string("op_3870"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_3871 = reshape(shape = var_3870, x = q_61)[name = string("op_3871")];
+            tensor<int32, [4]> transpose_63_perm_0 = const()[name = string("transpose_63_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3894 = const()[name = string("op_3894"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_63 = transpose(perm = transpose_63_perm_0, x = var_3871)[name = string("transpose_138")];
+            tensor<fp16, [3, 8, 512]> x_101 = reshape(shape = var_3894, x = transpose_63)[name = string("x_101")];
+            int32 var_3900 = const()[name = string("op_3900"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_3902 = mul(x = x_101, y = const_61_promoted)[name = string("op_3902")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_155 = concat(axis = var_3900, interleave = input_155_interleave_0, values = (x_101, var_3902))[name = string("input_155")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3897_to_fp16 = const()[name = string("op_3897_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_3897_to_fp16, x = input_155)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_3907_split_sizes_0 = const()[name = string("op_3907_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3907_axis_0 = const()[name = string("op_3907_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_3907_0, tensor<fp16, [3, 8, 512]> var_3907_1 = split(axis = var_3907_axis_0, split_sizes = var_3907_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_3907")];
+            tensor<fp16, [3, 8, 512]> q_65 = mul(x = var_3907_0, y = layers_5_self_attn_q_norm_weight)[name = string("q_65")];
+            tensor<int32, [4]> var_3914 = const()[name = string("op_3914"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_3915 = reshape(shape = var_3914, x = q_65)[name = string("op_3915")];
+            tensor<int32, [4]> var_3920 = const()[name = string("op_3920"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_67 = transpose(perm = var_3920, x = var_3915)[name = string("transpose_137")];
+            tensor<fp16, [1, 8, 3, 512]> var_3922_cast_fp16 = mul(x = q_67, y = cos_f)[name = string("op_3922_cast_fp16")];
+            tensor<int32, [2]> var_3923_split_sizes_0 = const()[name = string("op_3923_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3923_axis_0 = const()[name = string("op_3923_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_3923_0, tensor<fp16, [1, 8, 3, 256]> var_3923_1 = split(axis = var_3923_axis_0, split_sizes = var_3923_split_sizes_0, x = q_67)[name = string("op_3923")];
+            fp16 const_62_promoted = const()[name = string("const_62_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_3925 = mul(x = var_3923_1, y = const_62_promoted)[name = string("op_3925")];
+            int32 var_3927 = const()[name = string("op_3927"), val = int32(-1)];
+            bool var_3928_interleave_0 = const()[name = string("op_3928_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_3928 = concat(axis = var_3927, interleave = var_3928_interleave_0, values = (var_3925, var_3923_0))[name = string("op_3928")];
+            tensor<fp16, [1, 8, 3, 512]> var_3929_cast_fp16 = mul(x = var_3928, y = sin_f)[name = string("op_3929_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_71_cast_fp16 = add(x = var_3922_cast_fp16, y = var_3929_cast_fp16)[name = string("q_71_cast_fp16")];
+            string k_31_pad_type_0 = const()[name = string("k_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_31_strides_0 = const()[name = string("k_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_31_pad_0 = const()[name = string("k_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_31_dilations_0 = const()[name = string("k_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_31_groups_0 = const()[name = string("k_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> k_31 = conv(dilations = k_31_dilations_0, groups = k_31_groups_0, pad = k_31_pad_0, pad_type = k_31_pad_type_0, strides = k_31_strides_0, weight = layers_5_self_attn_k_proj_weight_palettized, x = var_3849_cast_fp16)[name = string("k_31")];
+            tensor<int32, [4]> var_3947 = const()[name = string("op_3947"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_3948 = reshape(shape = var_3947, x = k_31)[name = string("op_3948")];
+            tensor<int32, [4]> transpose_64_perm_0 = const()[name = string("transpose_64_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_11_pad_type_0 = const()[name = string("v_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_11_strides_0 = const()[name = string("v_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_11_pad_0 = const()[name = string("v_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_11_dilations_0 = const()[name = string("v_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_11_groups_0 = const()[name = string("v_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> v_11 = conv(dilations = v_11_dilations_0, groups = v_11_groups_0, pad = v_11_pad_0, pad_type = v_11_pad_type_0, strides = v_11_strides_0, weight = layers_5_self_attn_v_proj_weight_palettized, x = var_3849_cast_fp16)[name = string("v_11")];
+            tensor<int32, [4]> var_3975 = const()[name = string("op_3975"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_3976 = reshape(shape = var_3975, x = v_11)[name = string("op_3976")];
+            tensor<int32, [4]> var_3981 = const()[name = string("op_3981"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3999 = const()[name = string("op_3999"), val = tensor<int32, [3]>([3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> transpose_64 = transpose(perm = transpose_64_perm_0, x = var_3948)[name = string("transpose_136")];
+            tensor<fp16, [3, 2, 512]> x_103 = reshape(shape = var_3999, x = transpose_64)[name = string("x_103")];
+            int32 var_4005 = const()[name = string("op_4005"), val = int32(-1)];
+            fp16 const_63_promoted = const()[name = string("const_63_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 512]> var_4007 = mul(x = x_103, y = const_63_promoted)[name = string("op_4007")];
+            bool input_157_interleave_0 = const()[name = string("input_157_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 1024]> input_157 = concat(axis = var_4005, interleave = input_157_interleave_0, values = (x_103, var_4007))[name = string("input_157")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4002_to_fp16 = const()[name = string("op_4002_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 1024]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_4002_to_fp16, x = input_157)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_4012_split_sizes_0 = const()[name = string("op_4012_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_4012_axis_0 = const()[name = string("op_4012_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 512]> var_4012_0, tensor<fp16, [3, 2, 512]> var_4012_1 = split(axis = var_4012_axis_0, split_sizes = var_4012_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_4012")];
+            tensor<fp16, [3, 2, 512]> k_35 = mul(x = var_4012_0, y = layers_5_self_attn_k_norm_weight)[name = string("k_35")];
+            tensor<int32, [4]> var_4019 = const()[name = string("op_4019"), val = tensor<int32, [4]>([1, 3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> var_4020 = reshape(shape = var_4019, x = k_35)[name = string("op_4020")];
+            tensor<int32, [4]> var_4025 = const()[name = string("op_4025"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_4027_promoted = const()[name = string("op_4027_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 512]> var_3982 = transpose(perm = var_3981, x = var_3976)[name = string("transpose_135")];
+            tensor<fp16, [1, 2, 3, 512]> var_4028 = pow(x = var_3982, y = var_4027_promoted)[name = string("op_4028")];
+            tensor<int32, [1]> var_4033_axes_0 = const()[name = string("op_4033_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4033_keep_dims_0 = const()[name = string("op_4033_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_4033 = reduce_mean(axes = var_4033_axes_0, keep_dims = var_4033_keep_dims_0, x = var_4028)[name = string("op_4033")];
+            fp16 var_4035_to_fp16 = const()[name = string("op_4035_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_11_cast_fp16 = add(x = var_4033, y = var_4035_to_fp16)[name = string("mean_sq_11_cast_fp16")];
+            fp32 var_4037_epsilon_0 = const()[name = string("op_4037_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_4037_cast_fp16 = rsqrt(epsilon = var_4037_epsilon_0, x = mean_sq_11_cast_fp16)[name = string("op_4037_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> v_13_cast_fp16 = mul(x = var_3982, y = var_4037_cast_fp16)[name = string("v_13_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> q_69 = transpose(perm = var_4025, x = var_4020)[name = string("transpose_134")];
+            tensor<fp16, [1, 2, 3, 512]> var_4039_cast_fp16 = mul(x = q_69, y = cos_f)[name = string("op_4039_cast_fp16")];
+            tensor<int32, [2]> var_4040_split_sizes_0 = const()[name = string("op_4040_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4040_axis_0 = const()[name = string("op_4040_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 256]> var_4040_0, tensor<fp16, [1, 2, 3, 256]> var_4040_1 = split(axis = var_4040_axis_0, split_sizes = var_4040_split_sizes_0, x = q_69)[name = string("op_4040")];
+            fp16 const_64_promoted = const()[name = string("const_64_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 256]> var_4042 = mul(x = var_4040_1, y = const_64_promoted)[name = string("op_4042")];
+            int32 var_4044 = const()[name = string("op_4044"), val = int32(-1)];
+            bool var_4045_interleave_0 = const()[name = string("op_4045_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 512]> var_4045 = concat(axis = var_4044, interleave = var_4045_interleave_0, values = (var_4042, var_4040_0))[name = string("op_4045")];
+            tensor<fp16, [1, 2, 3, 512]> var_4046_cast_fp16 = mul(x = var_4045, y = sin_f)[name = string("op_4046_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> k_37_cast_fp16 = add(x = var_4039_cast_fp16, y = var_4046_cast_fp16)[name = string("k_37_cast_fp16")];
+            tensor<int32, [4]> var_4055_reps_0 = const()[name = string("op_4055_reps_0"), val = tensor<int32, [4]>([1, 2, 1, 1])];
+            tensor<fp16, [1, 2, 2048, 3]> var_4055_cast_fp16 = tile(reps = var_4055_reps_0, x = update_indicator)[name = string("op_4055_cast_fp16")];
+            bool k_scattered_1_transpose_x_0 = const()[name = string("k_scattered_1_transpose_x_0"), val = bool(false)];
+            bool k_scattered_1_transpose_y_0 = const()[name = string("k_scattered_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> k_scattered_1_cast_fp16 = matmul(transpose_x = k_scattered_1_transpose_x_0, transpose_y = k_scattered_1_transpose_y_0, x = var_4055_cast_fp16, y = k_37_cast_fp16)[name = string("k_scattered_1_cast_fp16")];
+            bool v_scattered_1_transpose_x_0 = const()[name = string("v_scattered_1_transpose_x_0"), val = bool(false)];
+            bool v_scattered_1_transpose_y_0 = const()[name = string("v_scattered_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> v_scattered_1_cast_fp16 = matmul(transpose_x = v_scattered_1_transpose_x_0, transpose_y = v_scattered_1_transpose_y_0, x = var_4055_cast_fp16, y = v_13_cast_fp16)[name = string("v_scattered_1_cast_fp16")];
+            tensor<int32, [1]> var_4069_axes_0 = const()[name = string("op_4069_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4069_keep_dims_0 = const()[name = string("op_4069_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 1, 2048, 1]> var_4069_cast_fp16 = reduce_sum(axes = var_4069_axes_0, keep_dims = var_4069_keep_dims_0, x = update_indicator)[name = string("op_4069_cast_fp16")];
+            tensor<int32, [4]> slot_k_11_begin_0 = const()[name = string("slot_k_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_11_end_0 = const()[name = string("slot_k_11_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_k_11_end_mask_0 = const()[name = string("slot_k_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_k_11_cast_fp16 = slice_by_index(begin = slot_k_11_begin_0, end = slot_k_11_end_0, end_mask = slot_k_11_end_mask_0, x = K_full_in)[name = string("slot_k_11_cast_fp16")];
+            tensor<int32, [4]> slot_v_11_begin_0 = const()[name = string("slot_v_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_11_end_0 = const()[name = string("slot_v_11_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_v_11_end_mask_0 = const()[name = string("slot_v_11_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_v_11_cast_fp16 = slice_by_index(begin = slot_v_11_begin_0, end = slot_v_11_end_0, end_mask = slot_v_11_end_mask_0, x = V_full_in)[name = string("slot_v_11_cast_fp16")];
+            fp16 var_4080_promoted_to_fp16 = const()[name = string("op_4080_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 2048, 1]> var_4082_cast_fp16 = sub(x = var_4080_promoted_to_fp16, y = var_4069_cast_fp16)[name = string("op_4082_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4083_cast_fp16 = mul(x = slot_k_11_cast_fp16, y = var_4082_cast_fp16)[name = string("op_4083_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_k_11_cast_fp16 = add(x = var_4083_cast_fp16, y = k_scattered_1_cast_fp16)[name = string("new_k_11_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4089_cast_fp16 = mul(x = slot_v_11_cast_fp16, y = var_4082_cast_fp16)[name = string("op_4089_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_v_11_cast_fp16 = add(x = var_4089_cast_fp16, y = v_scattered_1_cast_fp16)[name = string("new_v_11_cast_fp16")];
+            tensor<int32, [4]> var_4101_begin_0 = const()[name = string("op_4101_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_4101_end_0 = const()[name = string("op_4101_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_4101_end_mask_0 = const()[name = string("op_4101_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4101_cast_fp16 = slice_by_index(begin = var_4101_begin_0, end = var_4101_end_0, end_mask = var_4101_end_mask_0, x = K_full_in)[name = string("op_4101_cast_fp16")];
+            int32 var_4103 = const()[name = string("op_4103"), val = int32(0)];
+            bool K_full_out_1_interleave_0 = const()[name = string("K_full_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out_1_cast_fp16 = concat(axis = var_4103, interleave = K_full_out_1_interleave_0, values = (new_k_11_cast_fp16, var_4101_cast_fp16))[name = string("K_full_out_1_cast_fp16")];
+            tensor<int32, [4]> var_4114_begin_0 = const()[name = string("op_4114_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_4114_end_0 = const()[name = string("op_4114_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_4114_end_mask_0 = const()[name = string("op_4114_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4114_cast_fp16 = slice_by_index(begin = var_4114_begin_0, end = var_4114_end_0, end_mask = var_4114_end_mask_0, x = V_full_in)[name = string("op_4114_cast_fp16")];
+            int32 var_4116 = const()[name = string("op_4116"), val = int32(0)];
+            bool V_full_out_1_interleave_0 = const()[name = string("V_full_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out_1_cast_fp16 = concat(axis = var_4116, interleave = V_full_out_1_interleave_0, values = (new_v_11_cast_fp16, var_4114_cast_fp16))[name = string("V_full_out_1_cast_fp16")];
+            tensor<int32, [4]> var_4122_begin_0 = const()[name = string("op_4122_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4122_end_0 = const()[name = string("op_4122_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_4122_end_mask_0 = const()[name = string("op_4122_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4122_cast_fp16 = slice_by_index(begin = var_4122_begin_0, end = var_4122_end_0, end_mask = var_4122_end_mask_0, x = K_full_out_1_cast_fp16)[name = string("op_4122_cast_fp16")];
+            tensor<int32, [4]> var_4132_begin_0 = const()[name = string("op_4132_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4132_end_0 = const()[name = string("op_4132_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_4132_end_mask_0 = const()[name = string("op_4132_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4132_cast_fp16 = slice_by_index(begin = var_4132_begin_0, end = var_4132_end_0, end_mask = var_4132_end_mask_0, x = V_full_out_1_cast_fp16)[name = string("op_4132_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = var_4122_cast_fp16)[name = string("transpose_133")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_22, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_132")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_23, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_65_perm_0 = const()[name = string("transpose_65_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = var_4132_cast_fp16)[name = string("transpose_131")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_24 = const()[name = string("concat_24"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_24, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_25 = const()[name = string("concat_25"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_130")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_25, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_65_cast_fp16 = transpose(perm = transpose_65_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_129")];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_71_cast_fp16, y = transpose_65_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_107_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_107_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_107_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 3, 2048]> var_4167 = sub(x = x_107_cast_fp16, y = reduce_max_5)[name = string("op_4167")];
+            tensor<fp16, [1, 8, 3, 2048]> var_4173 = exp(x = var_4167)[name = string("op_4173")];
+            tensor<int32, [1]> var_4183_axes_0 = const()[name = string("op_4183_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4183_keep_dims_0 = const()[name = string("op_4183_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_4183 = reduce_sum(axes = var_4183_axes_0, keep_dims = var_4183_keep_dims_0, x = var_4173)[name = string("op_4183")];
+            tensor<fp16, [1, 8, 3, 2048]> var_4189_cast_fp16 = real_div(x = var_4173, y = var_4183)[name = string("op_4189_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_128")];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_4189_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_4200 = const()[name = string("op_4200"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4207 = const()[name = string("op_4207"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_4201_cast_fp16 = transpose(perm = var_4200, x = attn_output_31_cast_fp16)[name = string("transpose_127")];
+            tensor<fp16, [1, 3, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_4207, x = var_4201_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_4212 = const()[name = string("op_4212"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4228_pad_type_0 = const()[name = string("op_4228_pad_type_0"), val = string("valid")];
+            int32 var_4228_groups_0 = const()[name = string("op_4228_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4228_strides_0 = const()[name = string("op_4228_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4228_pad_0 = const()[name = string("op_4228_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4228_dilations_0 = const()[name = string("op_4228_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546132672))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551375616))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_4213_cast_fp16 = transpose(perm = var_4212, x = attn_output_33_cast_fp16)[name = string("transpose_126")];
+            tensor<fp16, [1, 2560, 3]> var_4228_cast_fp16 = conv(dilations = var_4228_dilations_0, groups = var_4228_groups_0, pad = var_4228_pad_0, pad_type = var_4228_pad_type_0, strides = var_4228_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_4213_cast_fp16)[name = string("op_4228_cast_fp16")];
+            tensor<int32, [3]> var_4232 = const()[name = string("op_4232"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4238 = const()[name = string("op_4238"), val = int32(-1)];
+            fp16 const_65_promoted_to_fp16 = const()[name = string("const_65_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_111_cast_fp16 = transpose(perm = var_4232, x = var_4228_cast_fp16)[name = string("transpose_125")];
+            tensor<fp16, [1, 3, 2560]> var_4240_cast_fp16 = mul(x = x_111_cast_fp16, y = const_65_promoted_to_fp16)[name = string("op_4240_cast_fp16")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_161_cast_fp16 = concat(axis = var_4238, interleave = input_161_interleave_0, values = (x_111_cast_fp16, var_4240_cast_fp16))[name = string("input_161_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4235_to_fp16 = const()[name = string("op_4235_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_4235_to_fp16, x = input_161_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_4245_split_sizes_0 = const()[name = string("op_4245_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4245_axis_0 = const()[name = string("op_4245_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4245_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4245_cast_fp16_1 = split(axis = var_4245_axis_0, split_sizes = var_4245_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_4245_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551378240)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_35_cast_fp16 = mul(x = var_4245_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_113_cast_fp16 = add(x = x_99_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_113_cast_fp16")];
+            int32 var_4254 = const()[name = string("op_4254"), val = int32(-1)];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_4256_cast_fp16 = mul(x = x_113_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_4256_cast_fp16")];
+            bool input_163_interleave_0 = const()[name = string("input_163_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_163_cast_fp16 = concat(axis = var_4254, interleave = input_163_interleave_0, values = (x_113_cast_fp16, var_4256_cast_fp16))[name = string("input_163_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4251_to_fp16 = const()[name = string("op_4251_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_4251_to_fp16, x = input_163_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_4261_split_sizes_0 = const()[name = string("op_4261_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4261_axis_0 = const()[name = string("op_4261_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4261_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4261_cast_fp16_1 = split(axis = var_4261_axis_0, split_sizes = var_4261_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_4261_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551383424)))];
+            tensor<fp16, [1, 3, 2560]> h_33_cast_fp16 = mul(x = var_4261_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_4272 = const()[name = string("op_4272"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_165_axes_0 = const()[name = string("input_165_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4273 = transpose(perm = var_4272, x = h_33_cast_fp16)[name = string("transpose_124")];
+            tensor<fp16, [1, 2560, 1, 3]> input_165 = expand_dims(axes = input_165_axes_0, x = var_4273)[name = string("input_165")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_165)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_165)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 3]> input_167 = mul(x = gate_23, y = up_11)[name = string("input_167")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_167)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_4313_axes_0 = const()[name = string("op_4313_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4313 = squeeze(axes = var_4313_axes_0, x = mlp_out_11)[name = string("op_4313")];
+            tensor<int32, [3]> var_4317 = const()[name = string("op_4317"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4323 = const()[name = string("op_4323"), val = int32(-1)];
+            fp16 const_67_promoted = const()[name = string("const_67_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_115 = transpose(perm = var_4317, x = var_4313)[name = string("transpose_123")];
+            tensor<fp16, [1, 3, 2560]> var_4325 = mul(x = x_115, y = const_67_promoted)[name = string("op_4325")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_169 = concat(axis = var_4323, interleave = input_169_interleave_0, values = (x_115, var_4325))[name = string("input_169")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4320_to_fp16 = const()[name = string("op_4320_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_4320_to_fp16, x = input_169)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_4330_split_sizes_0 = const()[name = string("op_4330_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4330_axis_0 = const()[name = string("op_4330_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4330_0, tensor<fp16, [1, 3, 2560]> var_4330_1 = split(axis = var_4330_axis_0, split_sizes = var_4330_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_4330")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_53 = mul(x = var_4330_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_55_cast_fp16 = add(x = x_113_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 4352])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 3, 4608])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_4358 = const()[name = string("op_4358"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_171_axes_0 = const()[name = string("input_171_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4359 = transpose(perm = var_4358, x = hidden_states_55_cast_fp16)[name = string("transpose_122")];
+            tensor<fp16, [1, 2560, 1, 3]> input_171 = expand_dims(axes = input_171_axes_0, x = var_4359)[name = string("input_171")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_171)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_4378 = const()[name = string("op_4378"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_4379_cast_fp16 = transpose(perm = var_4378, x = per_layer_slice_11_cast_fp16)[name = string("transpose_121")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_4379_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_173_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_173_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551388608))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551716352))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_173_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_4395_axes_0 = const()[name = string("op_4395_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4395_cast_fp16 = squeeze(axes = var_4395_axes_0, x = gated_35_cast_fp16)[name = string("op_4395_cast_fp16")];
+            tensor<int32, [3]> var_4399 = const()[name = string("op_4399"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4405 = const()[name = string("op_4405"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_117_cast_fp16 = transpose(perm = var_4399, x = var_4395_cast_fp16)[name = string("transpose_120")];
+            tensor<fp16, [1, 3, 2560]> var_4407_cast_fp16 = mul(x = x_117_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_4407_cast_fp16")];
+            bool input_175_interleave_0 = const()[name = string("input_175_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_175_cast_fp16 = concat(axis = var_4405, interleave = input_175_interleave_0, values = (x_117_cast_fp16, var_4407_cast_fp16))[name = string("input_175_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4402_to_fp16 = const()[name = string("op_4402_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_4402_to_fp16, x = input_175_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_4412_split_sizes_0 = const()[name = string("op_4412_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4412_axis_0 = const()[name = string("op_4412_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4412_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4412_cast_fp16_1 = split(axis = var_4412_axis_0, split_sizes = var_4412_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_4412_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551718976)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_59_cast_fp16 = mul(x = var_4412_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_69_promoted_to_fp16 = const()[name = string("const_69_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b2p-2])];
+            tensor<fp16, [1, 3, 2560]> x_119_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_69_promoted_to_fp16)[name = string("x_119_cast_fp16")];
+            int32 var_4427 = const()[name = string("op_4427"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_4429_cast_fp16 = mul(x = x_119_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_4429_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_177_cast_fp16 = concat(axis = var_4427, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_4429_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4424_to_fp16 = const()[name = string("op_4424_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_4424_to_fp16, x = input_177_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_4434_split_sizes_0 = const()[name = string("op_4434_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4434_axis_0 = const()[name = string("op_4434_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4434_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4434_cast_fp16_1 = split(axis = var_4434_axis_0, split_sizes = var_4434_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_4434_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551724160)))];
+            tensor<fp16, [1, 3, 2560]> h_37_cast_fp16 = mul(x = var_4434_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_4440 = const()[name = string("op_4440"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4443_axes_0 = const()[name = string("op_4443_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4441_cast_fp16 = transpose(perm = var_4440, x = h_37_cast_fp16)[name = string("transpose_119")];
+            tensor<fp16, [1, 2560, 1, 3]> var_4443_cast_fp16 = expand_dims(axes = var_4443_axes_0, x = var_4441_cast_fp16)[name = string("op_4443_cast_fp16")];
+            string q_73_pad_type_0 = const()[name = string("q_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_73_strides_0 = const()[name = string("q_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_73_pad_0 = const()[name = string("q_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_73_dilations_0 = const()[name = string("q_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_73_groups_0 = const()[name = string("q_73_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_73 = conv(dilations = q_73_dilations_0, groups = q_73_groups_0, pad = q_73_pad_0, pad_type = q_73_pad_type_0, strides = q_73_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_4443_cast_fp16)[name = string("q_73")];
+            tensor<int32, [4]> var_4464 = const()[name = string("op_4464"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_4465 = reshape(shape = var_4464, x = q_73)[name = string("op_4465")];
+            tensor<int32, [4]> transpose_66_perm_0 = const()[name = string("transpose_66_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_4488 = const()[name = string("op_4488"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_66 = transpose(perm = transpose_66_perm_0, x = var_4465)[name = string("transpose_118")];
+            tensor<fp16, [3, 8, 256]> x_121 = reshape(shape = var_4488, x = transpose_66)[name = string("x_121")];
+            int32 var_4494 = const()[name = string("op_4494"), val = int32(-1)];
+            fp16 const_71_promoted = const()[name = string("const_71_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_4496 = mul(x = x_121, y = const_71_promoted)[name = string("op_4496")];
+            bool input_181_interleave_0 = const()[name = string("input_181_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_181 = concat(axis = var_4494, interleave = input_181_interleave_0, values = (x_121, var_4496))[name = string("input_181")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4491_to_fp16 = const()[name = string("op_4491_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_4491_to_fp16, x = input_181)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_4501_split_sizes_0 = const()[name = string("op_4501_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4501_axis_0 = const()[name = string("op_4501_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_4501_0, tensor<fp16, [3, 8, 256]> var_4501_1 = split(axis = var_4501_axis_0, split_sizes = var_4501_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_4501")];
+            tensor<fp16, [3, 8, 256]> q_77 = mul(x = var_4501_0, y = layers_2_self_attn_q_norm_weight)[name = string("q_77")];
+            tensor<int32, [4]> var_4508 = const()[name = string("op_4508"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_4509 = reshape(shape = var_4508, x = q_77)[name = string("op_4509")];
+            tensor<int32, [4]> var_4514 = const()[name = string("op_4514"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_79 = transpose(perm = var_4514, x = var_4509)[name = string("transpose_117")];
+            tensor<fp16, [1, 8, 3, 256]> var_4516_cast_fp16 = mul(x = q_79, y = cos_s)[name = string("op_4516_cast_fp16")];
+            tensor<int32, [2]> var_4517_split_sizes_0 = const()[name = string("op_4517_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4517_axis_0 = const()[name = string("op_4517_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_4517_0, tensor<fp16, [1, 8, 3, 128]> var_4517_1 = split(axis = var_4517_axis_0, split_sizes = var_4517_split_sizes_0, x = q_79)[name = string("op_4517")];
+            fp16 const_72_promoted = const()[name = string("const_72_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_4519 = mul(x = var_4517_1, y = const_72_promoted)[name = string("op_4519")];
+            int32 var_4521 = const()[name = string("op_4521"), val = int32(-1)];
+            bool var_4522_interleave_0 = const()[name = string("op_4522_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_4522 = concat(axis = var_4521, interleave = var_4522_interleave_0, values = (var_4519, var_4517_0))[name = string("op_4522")];
+            tensor<fp16, [1, 8, 3, 256]> var_4523_cast_fp16 = mul(x = var_4522, y = sin_s)[name = string("op_4523_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_83_cast_fp16 = add(x = var_4516_cast_fp16, y = var_4523_cast_fp16)[name = string("q_83_cast_fp16")];
+            string k_39_pad_type_0 = const()[name = string("k_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_39_strides_0 = const()[name = string("k_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_39_pad_0 = const()[name = string("k_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_39_dilations_0 = const()[name = string("k_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_39_groups_0 = const()[name = string("k_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_39 = conv(dilations = k_39_dilations_0, groups = k_39_groups_0, pad = k_39_pad_0, pad_type = k_39_pad_type_0, strides = k_39_strides_0, weight = layers_6_self_attn_k_proj_weight_palettized, x = var_4443_cast_fp16)[name = string("k_39")];
+            tensor<int32, [4]> var_4541 = const()[name = string("op_4541"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_4542 = reshape(shape = var_4541, x = k_39)[name = string("op_4542")];
+            tensor<int32, [4]> transpose_67_perm_0 = const()[name = string("transpose_67_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_15_pad_type_0 = const()[name = string("v_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_15_strides_0 = const()[name = string("v_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_15_pad_0 = const()[name = string("v_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_15_dilations_0 = const()[name = string("v_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_15_groups_0 = const()[name = string("v_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_15 = conv(dilations = v_15_dilations_0, groups = v_15_groups_0, pad = v_15_pad_0, pad_type = v_15_pad_type_0, strides = v_15_strides_0, weight = layers_6_self_attn_v_proj_weight_palettized, x = var_4443_cast_fp16)[name = string("v_15")];
+            tensor<int32, [4]> var_4569 = const()[name = string("op_4569"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_4570 = reshape(shape = var_4569, x = v_15)[name = string("op_4570")];
+            tensor<int32, [4]> var_4575 = const()[name = string("op_4575"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4593 = const()[name = string("op_4593"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_67 = transpose(perm = transpose_67_perm_0, x = var_4542)[name = string("transpose_116")];
+            tensor<fp16, [3, 2, 256]> x_123 = reshape(shape = var_4593, x = transpose_67)[name = string("x_123")];
+            int32 var_4599 = const()[name = string("op_4599"), val = int32(-1)];
+            fp16 const_73_promoted = const()[name = string("const_73_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_4601 = mul(x = x_123, y = const_73_promoted)[name = string("op_4601")];
+            bool input_183_interleave_0 = const()[name = string("input_183_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_183 = concat(axis = var_4599, interleave = input_183_interleave_0, values = (x_123, var_4601))[name = string("input_183")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4596_to_fp16 = const()[name = string("op_4596_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_4596_to_fp16, x = input_183)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_4606_split_sizes_0 = const()[name = string("op_4606_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4606_axis_0 = const()[name = string("op_4606_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_4606_0, tensor<fp16, [3, 2, 256]> var_4606_1 = split(axis = var_4606_axis_0, split_sizes = var_4606_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_4606")];
+            tensor<fp16, [3, 2, 256]> k_43 = mul(x = var_4606_0, y = layers_6_self_attn_k_norm_weight)[name = string("k_43")];
+            tensor<int32, [4]> var_4613 = const()[name = string("op_4613"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_4614 = reshape(shape = var_4613, x = k_43)[name = string("op_4614")];
+            tensor<int32, [4]> var_4619 = const()[name = string("op_4619"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_4621_promoted = const()[name = string("op_4621_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_4576 = transpose(perm = var_4575, x = var_4570)[name = string("transpose_115")];
+            tensor<fp16, [1, 2, 3, 256]> var_4622 = pow(x = var_4576, y = var_4621_promoted)[name = string("op_4622")];
+            tensor<int32, [1]> var_4627_axes_0 = const()[name = string("op_4627_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4627_keep_dims_0 = const()[name = string("op_4627_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_4627 = reduce_mean(axes = var_4627_axes_0, keep_dims = var_4627_keep_dims_0, x = var_4622)[name = string("op_4627")];
+            fp16 var_4629_to_fp16 = const()[name = string("op_4629_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_13_cast_fp16 = add(x = var_4627, y = var_4629_to_fp16)[name = string("mean_sq_13_cast_fp16")];
+            fp32 var_4631_epsilon_0 = const()[name = string("op_4631_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_4631_cast_fp16 = rsqrt(epsilon = var_4631_epsilon_0, x = mean_sq_13_cast_fp16)[name = string("op_4631_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_187_cast_fp16 = mul(x = var_4576, y = var_4631_cast_fp16)[name = string("input_187_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_81 = transpose(perm = var_4619, x = var_4614)[name = string("transpose_114")];
+            tensor<fp16, [1, 2, 3, 256]> var_4633_cast_fp16 = mul(x = q_81, y = cos_s)[name = string("op_4633_cast_fp16")];
+            tensor<int32, [2]> var_4634_split_sizes_0 = const()[name = string("op_4634_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4634_axis_0 = const()[name = string("op_4634_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_4634_0, tensor<fp16, [1, 2, 3, 128]> var_4634_1 = split(axis = var_4634_axis_0, split_sizes = var_4634_split_sizes_0, x = q_81)[name = string("op_4634")];
+            fp16 const_74_promoted = const()[name = string("const_74_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_4636 = mul(x = var_4634_1, y = const_74_promoted)[name = string("op_4636")];
+            int32 var_4638 = const()[name = string("op_4638"), val = int32(-1)];
+            bool var_4639_interleave_0 = const()[name = string("op_4639_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_4639 = concat(axis = var_4638, interleave = var_4639_interleave_0, values = (var_4636, var_4634_0))[name = string("op_4639")];
+            tensor<fp16, [1, 2, 3, 256]> var_4640_cast_fp16 = mul(x = var_4639, y = sin_s)[name = string("op_4640_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_185_cast_fp16 = add(x = var_4633_cast_fp16, y = var_4640_cast_fp16)[name = string("input_185_cast_fp16")];
+            tensor<int32, [8]> k_padded_11_pad_0 = const()[name = string("k_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_11_mode_0 = const()[name = string("k_padded_11_mode_0"), val = string("constant")];
+            fp16 const_75_to_fp16 = const()[name = string("const_75_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_11_cast_fp16 = pad(constant_val = const_75_to_fp16, mode = k_padded_11_mode_0, pad = k_padded_11_pad_0, x = input_185_cast_fp16)[name = string("k_padded_11_cast_fp16")];
+            tensor<int32, [8]> v_padded_11_pad_0 = const()[name = string("v_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_11_mode_0 = const()[name = string("v_padded_11_mode_0"), val = string("constant")];
+            fp16 const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_11_cast_fp16 = pad(constant_val = const_76_to_fp16, mode = v_padded_11_mode_0, pad = v_padded_11_pad_0, x = input_187_cast_fp16)[name = string("v_padded_11_cast_fp16")];
+            tensor<int32, [4]> slot_k_13_begin_0 = const()[name = string("slot_k_13_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_13_end_0 = const()[name = string("slot_k_13_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_13_end_mask_0 = const()[name = string("slot_k_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_13_cast_fp16 = slice_by_index(begin = slot_k_13_begin_0, end = slot_k_13_end_0, end_mask = slot_k_13_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("slot_k_13_cast_fp16")];
+            tensor<int32, [4]> slot_v_13_begin_0 = const()[name = string("slot_v_13_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_13_end_0 = const()[name = string("slot_v_13_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_13_end_mask_0 = const()[name = string("slot_v_13_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_13_cast_fp16 = slice_by_index(begin = slot_v_13_begin_0, end = slot_v_13_end_0, end_mask = slot_v_13_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("slot_v_13_cast_fp16")];
+            tensor<int32, [4]> var_4679_begin_0 = const()[name = string("op_4679_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_4679_end_0 = const()[name = string("op_4679_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4679_end_mask_0 = const()[name = string("op_4679_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_4679_cast_fp16 = slice_by_index(begin = var_4679_begin_0, end = var_4679_end_0, end_mask = var_4679_end_mask_0, x = slot_k_13_cast_fp16)[name = string("op_4679_cast_fp16")];
+            int32 var_4686 = const()[name = string("op_4686"), val = int32(2)];
+            bool new_k_13_interleave_0 = const()[name = string("new_k_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_13_cast_fp16 = concat(axis = var_4686, interleave = new_k_13_interleave_0, values = (var_4679_cast_fp16, k_padded_11_cast_fp16))[name = string("new_k_13_cast_fp16")];
+            tensor<int32, [4]> var_4702_begin_0 = const()[name = string("op_4702_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_4702_end_0 = const()[name = string("op_4702_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4702_end_mask_0 = const()[name = string("op_4702_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_4702_cast_fp16 = slice_by_index(begin = var_4702_begin_0, end = var_4702_end_0, end_mask = var_4702_end_mask_0, x = slot_v_13_cast_fp16)[name = string("op_4702_cast_fp16")];
+            int32 var_4709 = const()[name = string("op_4709"), val = int32(2)];
+            bool new_v_13_interleave_0 = const()[name = string("new_v_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_13_cast_fp16 = concat(axis = var_4709, interleave = new_v_13_interleave_0, values = (var_4702_cast_fp16, v_padded_11_cast_fp16))[name = string("new_v_13_cast_fp16")];
+            tensor<int32, [4]> var_4715_begin_0 = const()[name = string("op_4715_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4715_end_0 = const()[name = string("op_4715_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_4715_end_mask_0 = const()[name = string("op_4715_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_4715_cast_fp16 = slice_by_index(begin = var_4715_begin_0, end = var_4715_end_0, end_mask = var_4715_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("op_4715_cast_fp16")];
+            tensor<int32, [4]> var_4720_begin_0 = const()[name = string("op_4720_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4720_end_0 = const()[name = string("op_4720_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_4720_end_mask_0 = const()[name = string("op_4720_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_4720_cast_fp16 = slice_by_index(begin = var_4720_begin_0, end = var_4720_end_0, end_mask = var_4720_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("op_4720_cast_fp16")];
+            int32 var_4722 = const()[name = string("op_4722"), val = int32(0)];
+            bool K_sliding_out_11_interleave_0 = const()[name = string("K_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_11_cast_fp16 = concat(axis = var_4722, interleave = K_sliding_out_11_interleave_0, values = (var_4715_cast_fp16, new_k_13_cast_fp16, var_4720_cast_fp16))[name = string("K_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4728_begin_0 = const()[name = string("op_4728_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_4728_end_0 = const()[name = string("op_4728_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_4728_end_mask_0 = const()[name = string("op_4728_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [5, 2, 512, 512]> var_4728_cast_fp16 = slice_by_index(begin = var_4728_begin_0, end = var_4728_end_0, end_mask = var_4728_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("op_4728_cast_fp16")];
+            tensor<int32, [4]> var_4733_begin_0 = const()[name = string("op_4733_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_4733_end_0 = const()[name = string("op_4733_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_4733_end_mask_0 = const()[name = string("op_4733_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [4, 2, 512, 512]> var_4733_cast_fp16 = slice_by_index(begin = var_4733_begin_0, end = var_4733_end_0, end_mask = var_4733_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("op_4733_cast_fp16")];
+            int32 var_4735 = const()[name = string("op_4735"), val = int32(0)];
+            bool V_sliding_out_11_interleave_0 = const()[name = string("V_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_11_cast_fp16 = concat(axis = var_4735, interleave = V_sliding_out_11_interleave_0, values = (var_4728_cast_fp16, new_v_13_cast_fp16, var_4733_cast_fp16))[name = string("V_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4741_begin_0 = const()[name = string("op_4741_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4741_end_0 = const()[name = string("op_4741_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4741_end_mask_0 = const()[name = string("op_4741_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_4741_cast_fp16 = slice_by_index(begin = var_4741_begin_0, end = var_4741_end_0, end_mask = var_4741_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("op_4741_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_13_begin_0 = const()[name = string("K_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_13_end_0 = const()[name = string("K_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_13_end_mask_0 = const()[name = string("K_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_13_cast_fp16 = slice_by_index(begin = K_for_attn_13_begin_0, end = K_for_attn_13_end_0, end_mask = K_for_attn_13_end_mask_0, x = var_4741_cast_fp16)[name = string("K_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> var_4751_begin_0 = const()[name = string("op_4751_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4751_end_0 = const()[name = string("op_4751_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4751_end_mask_0 = const()[name = string("op_4751_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_4751_cast_fp16 = slice_by_index(begin = var_4751_begin_0, end = var_4751_end_0, end_mask = var_4751_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("op_4751_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_13_begin_0 = const()[name = string("V_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_13_end_0 = const()[name = string("V_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_13_end_mask_0 = const()[name = string("V_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_13_cast_fp16 = slice_by_index(begin = V_for_attn_13_begin_0, end = V_for_attn_13_end_0, end_mask = V_for_attn_13_end_mask_0, x = var_4751_cast_fp16)[name = string("V_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> transpose_24_perm_0 = const()[name = string("transpose_24_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_12_reps_0 = const()[name = string("tile_12_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_24_cast_fp16 = transpose(perm = transpose_24_perm_0, x = K_for_attn_13_cast_fp16)[name = string("transpose_113")];
+            tensor<fp16, [8, 1, 512, 256]> tile_12_cast_fp16 = tile(reps = tile_12_reps_0, x = transpose_24_cast_fp16)[name = string("tile_12_cast_fp16")];
+            tensor<int32, [5]> concat_26 = const()[name = string("concat_26"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_24_cast_fp16 = reshape(shape = concat_26, x = tile_12_cast_fp16)[name = string("reshape_24_cast_fp16")];
+            tensor<int32, [5]> transpose_25_perm_0 = const()[name = string("transpose_25_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_27 = const()[name = string("concat_27"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_25_cast_fp16 = transpose(perm = transpose_25_perm_0, x = reshape_24_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_25_cast_fp16 = reshape(shape = concat_27, x = transpose_25_cast_fp16)[name = string("reshape_25_cast_fp16")];
+            tensor<int32, [4]> transpose_68_perm_0 = const()[name = string("transpose_68_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_26_perm_0 = const()[name = string("transpose_26_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_13_reps_0 = const()[name = string("tile_13_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_26_cast_fp16 = transpose(perm = transpose_26_perm_0, x = V_for_attn_13_cast_fp16)[name = string("transpose_111")];
+            tensor<fp16, [8, 1, 512, 256]> tile_13_cast_fp16 = tile(reps = tile_13_reps_0, x = transpose_26_cast_fp16)[name = string("tile_13_cast_fp16")];
+            tensor<int32, [5]> concat_28 = const()[name = string("concat_28"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_26_cast_fp16 = reshape(shape = concat_28, x = tile_13_cast_fp16)[name = string("reshape_26_cast_fp16")];
+            tensor<int32, [5]> transpose_27_perm_0 = const()[name = string("transpose_27_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_29 = const()[name = string("concat_29"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_27_cast_fp16 = transpose(perm = transpose_27_perm_0, x = reshape_26_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_27_cast_fp16 = reshape(shape = concat_29, x = transpose_27_cast_fp16)[name = string("reshape_27_cast_fp16")];
+            tensor<int32, [4]> V_expanded_13_perm_0 = const()[name = string("V_expanded_13_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_68_cast_fp16 = transpose(perm = transpose_68_perm_0, x = reshape_25_cast_fp16)[name = string("transpose_109")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_83_cast_fp16, y = transpose_68_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_127_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_127_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_127_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 3, 512]> var_4786 = sub(x = x_127_cast_fp16, y = reduce_max_6)[name = string("op_4786")];
+            tensor<fp16, [1, 8, 3, 512]> var_4792 = exp(x = var_4786)[name = string("op_4792")];
+            tensor<int32, [1]> var_4802_axes_0 = const()[name = string("op_4802_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4802_keep_dims_0 = const()[name = string("op_4802_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_4802 = reduce_sum(axes = var_4802_axes_0, keep_dims = var_4802_keep_dims_0, x = var_4792)[name = string("op_4802")];
+            tensor<fp16, [1, 8, 3, 512]> var_4808_cast_fp16 = real_div(x = var_4792, y = var_4802)[name = string("op_4808_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_13_cast_fp16 = transpose(perm = V_expanded_13_perm_0, x = reshape_27_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_4808_cast_fp16, y = V_expanded_13_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_4819 = const()[name = string("op_4819"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4826 = const()[name = string("op_4826"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_4820_cast_fp16 = transpose(perm = var_4819, x = attn_output_37_cast_fp16)[name = string("transpose_107")];
+            tensor<fp16, [1, 3, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_4826, x = var_4820_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_4831 = const()[name = string("op_4831"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4847_pad_type_0 = const()[name = string("op_4847_pad_type_0"), val = string("valid")];
+            int32 var_4847_groups_0 = const()[name = string("op_4847_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4847_strides_0 = const()[name = string("op_4847_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4847_pad_0 = const()[name = string("op_4847_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4847_dilations_0 = const()[name = string("op_4847_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(551729344))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554350848))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_4832_cast_fp16 = transpose(perm = var_4831, x = attn_output_39_cast_fp16)[name = string("transpose_106")];
+            tensor<fp16, [1, 2560, 3]> var_4847_cast_fp16 = conv(dilations = var_4847_dilations_0, groups = var_4847_groups_0, pad = var_4847_pad_0, pad_type = var_4847_pad_type_0, strides = var_4847_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_4832_cast_fp16)[name = string("op_4847_cast_fp16")];
+            tensor<int32, [3]> var_4851 = const()[name = string("op_4851"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4857 = const()[name = string("op_4857"), val = int32(-1)];
+            fp16 const_77_promoted_to_fp16 = const()[name = string("const_77_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_131_cast_fp16 = transpose(perm = var_4851, x = var_4847_cast_fp16)[name = string("transpose_105")];
+            tensor<fp16, [1, 3, 2560]> var_4859_cast_fp16 = mul(x = x_131_cast_fp16, y = const_77_promoted_to_fp16)[name = string("op_4859_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_191_cast_fp16 = concat(axis = var_4857, interleave = input_191_interleave_0, values = (x_131_cast_fp16, var_4859_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4854_to_fp16 = const()[name = string("op_4854_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_4854_to_fp16, x = input_191_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_4864_split_sizes_0 = const()[name = string("op_4864_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4864_axis_0 = const()[name = string("op_4864_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4864_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4864_cast_fp16_1 = split(axis = var_4864_axis_0, split_sizes = var_4864_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_4864_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554353472)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_41_cast_fp16 = mul(x = var_4864_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_133_cast_fp16 = add(x = x_119_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_133_cast_fp16")];
+            int32 var_4873 = const()[name = string("op_4873"), val = int32(-1)];
+            fp16 const_78_promoted_to_fp16 = const()[name = string("const_78_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_4875_cast_fp16 = mul(x = x_133_cast_fp16, y = const_78_promoted_to_fp16)[name = string("op_4875_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_193_cast_fp16 = concat(axis = var_4873, interleave = input_193_interleave_0, values = (x_133_cast_fp16, var_4875_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4870_to_fp16 = const()[name = string("op_4870_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_4870_to_fp16, x = input_193_cast_fp16)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_4880_split_sizes_0 = const()[name = string("op_4880_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4880_axis_0 = const()[name = string("op_4880_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4880_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_4880_cast_fp16_1 = split(axis = var_4880_axis_0, split_sizes = var_4880_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_4880_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554358656)))];
+            tensor<fp16, [1, 3, 2560]> h_39_cast_fp16 = mul(x = var_4880_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_4891 = const()[name = string("op_4891"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_195_axes_0 = const()[name = string("input_195_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4892 = transpose(perm = var_4891, x = h_39_cast_fp16)[name = string("transpose_104")];
+            tensor<fp16, [1, 2560, 1, 3]> input_195 = expand_dims(axes = input_195_axes_0, x = var_4892)[name = string("input_195")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_195)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_195)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 3]> input_197 = mul(x = gate_27, y = up_13)[name = string("input_197")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_197)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_4932_axes_0 = const()[name = string("op_4932_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4932 = squeeze(axes = var_4932_axes_0, x = mlp_out_13)[name = string("op_4932")];
+            tensor<int32, [3]> var_4936 = const()[name = string("op_4936"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4942 = const()[name = string("op_4942"), val = int32(-1)];
+            fp16 const_79_promoted = const()[name = string("const_79_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_135 = transpose(perm = var_4936, x = var_4932)[name = string("transpose_103")];
+            tensor<fp16, [1, 3, 2560]> var_4944 = mul(x = x_135, y = const_79_promoted)[name = string("op_4944")];
+            bool input_199_interleave_0 = const()[name = string("input_199_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_199 = concat(axis = var_4942, interleave = input_199_interleave_0, values = (x_135, var_4944))[name = string("input_199")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4939_to_fp16 = const()[name = string("op_4939_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_4939_to_fp16, x = input_199)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_4949_split_sizes_0 = const()[name = string("op_4949_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4949_axis_0 = const()[name = string("op_4949_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_4949_0, tensor<fp16, [1, 3, 2560]> var_4949_1 = split(axis = var_4949_axis_0, split_sizes = var_4949_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_4949")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_63 = mul(x = var_4949_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_65_cast_fp16 = add(x = x_133_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 4608])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 3, 4864])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_4977 = const()[name = string("op_4977"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_201_axes_0 = const()[name = string("input_201_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_4978 = transpose(perm = var_4977, x = hidden_states_65_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [1, 2560, 1, 3]> input_201 = expand_dims(axes = input_201_axes_0, x = var_4978)[name = string("input_201")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_201)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_4997 = const()[name = string("op_4997"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_4998_cast_fp16 = transpose(perm = var_4997, x = per_layer_slice_13_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_4998_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_203_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_203_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554363840))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554691584))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_203_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_5014_axes_0 = const()[name = string("op_5014_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5014_cast_fp16 = squeeze(axes = var_5014_axes_0, x = gated_41_cast_fp16)[name = string("op_5014_cast_fp16")];
+            tensor<int32, [3]> var_5018 = const()[name = string("op_5018"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5024 = const()[name = string("op_5024"), val = int32(-1)];
+            fp16 const_80_promoted_to_fp16 = const()[name = string("const_80_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_137_cast_fp16 = transpose(perm = var_5018, x = var_5014_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [1, 3, 2560]> var_5026_cast_fp16 = mul(x = x_137_cast_fp16, y = const_80_promoted_to_fp16)[name = string("op_5026_cast_fp16")];
+            bool input_205_interleave_0 = const()[name = string("input_205_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_205_cast_fp16 = concat(axis = var_5024, interleave = input_205_interleave_0, values = (x_137_cast_fp16, var_5026_cast_fp16))[name = string("input_205_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5021_to_fp16 = const()[name = string("op_5021_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_5021_to_fp16, x = input_205_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_5031_split_sizes_0 = const()[name = string("op_5031_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5031_axis_0 = const()[name = string("op_5031_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5031_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5031_cast_fp16_1 = split(axis = var_5031_axis_0, split_sizes = var_5031_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_5031_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554694208)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_69_cast_fp16 = mul(x = var_5031_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_81_promoted_to_fp16 = const()[name = string("const_81_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.16p-1])];
+            tensor<fp16, [1, 3, 2560]> x_139_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_81_promoted_to_fp16)[name = string("x_139_cast_fp16")];
+            int32 var_5046 = const()[name = string("op_5046"), val = int32(-1)];
+            fp16 const_82_promoted_to_fp16 = const()[name = string("const_82_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_5048_cast_fp16 = mul(x = x_139_cast_fp16, y = const_82_promoted_to_fp16)[name = string("op_5048_cast_fp16")];
+            bool input_207_interleave_0 = const()[name = string("input_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_207_cast_fp16 = concat(axis = var_5046, interleave = input_207_interleave_0, values = (x_139_cast_fp16, var_5048_cast_fp16))[name = string("input_207_cast_fp16")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5043_to_fp16 = const()[name = string("op_5043_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_5043_to_fp16, x = input_207_cast_fp16)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_5053_split_sizes_0 = const()[name = string("op_5053_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5053_axis_0 = const()[name = string("op_5053_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5053_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5053_cast_fp16_1 = split(axis = var_5053_axis_0, split_sizes = var_5053_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_5053_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554699392)))];
+            tensor<fp16, [1, 3, 2560]> h_43_cast_fp16 = mul(x = var_5053_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_5059 = const()[name = string("op_5059"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5062_axes_0 = const()[name = string("op_5062_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5060_cast_fp16 = transpose(perm = var_5059, x = h_43_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 2560, 1, 3]> var_5062_cast_fp16 = expand_dims(axes = var_5062_axes_0, x = var_5060_cast_fp16)[name = string("op_5062_cast_fp16")];
+            string q_85_pad_type_0 = const()[name = string("q_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_85_strides_0 = const()[name = string("q_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_85_pad_0 = const()[name = string("q_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_85_dilations_0 = const()[name = string("q_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_85_groups_0 = const()[name = string("q_85_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_85 = conv(dilations = q_85_dilations_0, groups = q_85_groups_0, pad = q_85_pad_0, pad_type = q_85_pad_type_0, strides = q_85_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_5062_cast_fp16)[name = string("q_85")];
+            tensor<int32, [4]> var_5083 = const()[name = string("op_5083"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_5084 = reshape(shape = var_5083, x = q_85)[name = string("op_5084")];
+            tensor<int32, [4]> transpose_69_perm_0 = const()[name = string("transpose_69_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_5107 = const()[name = string("op_5107"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_69 = transpose(perm = transpose_69_perm_0, x = var_5084)[name = string("transpose_98")];
+            tensor<fp16, [3, 8, 256]> x_141 = reshape(shape = var_5107, x = transpose_69)[name = string("x_141")];
+            int32 var_5113 = const()[name = string("op_5113"), val = int32(-1)];
+            fp16 const_83_promoted = const()[name = string("const_83_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_5115 = mul(x = x_141, y = const_83_promoted)[name = string("op_5115")];
+            bool input_211_interleave_0 = const()[name = string("input_211_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_211 = concat(axis = var_5113, interleave = input_211_interleave_0, values = (x_141, var_5115))[name = string("input_211")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5110_to_fp16 = const()[name = string("op_5110_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_5110_to_fp16, x = input_211)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_5120_split_sizes_0 = const()[name = string("op_5120_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5120_axis_0 = const()[name = string("op_5120_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_5120_0, tensor<fp16, [3, 8, 256]> var_5120_1 = split(axis = var_5120_axis_0, split_sizes = var_5120_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_5120")];
+            tensor<fp16, [3, 8, 256]> q_89 = mul(x = var_5120_0, y = layers_7_self_attn_q_norm_weight)[name = string("q_89")];
+            tensor<int32, [4]> var_5127 = const()[name = string("op_5127"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_5128 = reshape(shape = var_5127, x = q_89)[name = string("op_5128")];
+            tensor<int32, [4]> var_5133 = const()[name = string("op_5133"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_91 = transpose(perm = var_5133, x = var_5128)[name = string("transpose_97")];
+            tensor<fp16, [1, 8, 3, 256]> var_5135_cast_fp16 = mul(x = q_91, y = cos_s)[name = string("op_5135_cast_fp16")];
+            tensor<int32, [2]> var_5136_split_sizes_0 = const()[name = string("op_5136_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5136_axis_0 = const()[name = string("op_5136_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_5136_0, tensor<fp16, [1, 8, 3, 128]> var_5136_1 = split(axis = var_5136_axis_0, split_sizes = var_5136_split_sizes_0, x = q_91)[name = string("op_5136")];
+            fp16 const_84_promoted = const()[name = string("const_84_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_5138 = mul(x = var_5136_1, y = const_84_promoted)[name = string("op_5138")];
+            int32 var_5140 = const()[name = string("op_5140"), val = int32(-1)];
+            bool var_5141_interleave_0 = const()[name = string("op_5141_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_5141 = concat(axis = var_5140, interleave = var_5141_interleave_0, values = (var_5138, var_5136_0))[name = string("op_5141")];
+            tensor<fp16, [1, 8, 3, 256]> var_5142_cast_fp16 = mul(x = var_5141, y = sin_s)[name = string("op_5142_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_95_cast_fp16 = add(x = var_5135_cast_fp16, y = var_5142_cast_fp16)[name = string("q_95_cast_fp16")];
+            string k_45_pad_type_0 = const()[name = string("k_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_45_strides_0 = const()[name = string("k_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_45_pad_0 = const()[name = string("k_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_45_dilations_0 = const()[name = string("k_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_45_groups_0 = const()[name = string("k_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_45 = conv(dilations = k_45_dilations_0, groups = k_45_groups_0, pad = k_45_pad_0, pad_type = k_45_pad_type_0, strides = k_45_strides_0, weight = layers_7_self_attn_k_proj_weight_palettized, x = var_5062_cast_fp16)[name = string("k_45")];
+            tensor<int32, [4]> var_5160 = const()[name = string("op_5160"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5161 = reshape(shape = var_5160, x = k_45)[name = string("op_5161")];
+            tensor<int32, [4]> transpose_70_perm_0 = const()[name = string("transpose_70_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_17_pad_type_0 = const()[name = string("v_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_17_strides_0 = const()[name = string("v_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_17_pad_0 = const()[name = string("v_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_17_dilations_0 = const()[name = string("v_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_17_groups_0 = const()[name = string("v_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_17 = conv(dilations = v_17_dilations_0, groups = v_17_groups_0, pad = v_17_pad_0, pad_type = v_17_pad_type_0, strides = v_17_strides_0, weight = layers_7_self_attn_v_proj_weight_palettized, x = var_5062_cast_fp16)[name = string("v_17")];
+            tensor<int32, [4]> var_5188 = const()[name = string("op_5188"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5189 = reshape(shape = var_5188, x = v_17)[name = string("op_5189")];
+            tensor<int32, [4]> var_5194 = const()[name = string("op_5194"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5212 = const()[name = string("op_5212"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_70 = transpose(perm = transpose_70_perm_0, x = var_5161)[name = string("transpose_96")];
+            tensor<fp16, [3, 2, 256]> x_143 = reshape(shape = var_5212, x = transpose_70)[name = string("x_143")];
+            int32 var_5218 = const()[name = string("op_5218"), val = int32(-1)];
+            fp16 const_85_promoted = const()[name = string("const_85_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_5220 = mul(x = x_143, y = const_85_promoted)[name = string("op_5220")];
+            bool input_213_interleave_0 = const()[name = string("input_213_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_213 = concat(axis = var_5218, interleave = input_213_interleave_0, values = (x_143, var_5220))[name = string("input_213")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5215_to_fp16 = const()[name = string("op_5215_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_5215_to_fp16, x = input_213)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_5225_split_sizes_0 = const()[name = string("op_5225_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5225_axis_0 = const()[name = string("op_5225_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_5225_0, tensor<fp16, [3, 2, 256]> var_5225_1 = split(axis = var_5225_axis_0, split_sizes = var_5225_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_5225")];
+            tensor<fp16, [3, 2, 256]> k_49 = mul(x = var_5225_0, y = layers_7_self_attn_k_norm_weight)[name = string("k_49")];
+            tensor<int32, [4]> var_5232 = const()[name = string("op_5232"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_5233 = reshape(shape = var_5232, x = k_49)[name = string("op_5233")];
+            tensor<int32, [4]> var_5238 = const()[name = string("op_5238"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_5240_promoted = const()[name = string("op_5240_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_5195 = transpose(perm = var_5194, x = var_5189)[name = string("transpose_95")];
+            tensor<fp16, [1, 2, 3, 256]> var_5241 = pow(x = var_5195, y = var_5240_promoted)[name = string("op_5241")];
+            tensor<int32, [1]> var_5246_axes_0 = const()[name = string("op_5246_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5246_keep_dims_0 = const()[name = string("op_5246_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_5246 = reduce_mean(axes = var_5246_axes_0, keep_dims = var_5246_keep_dims_0, x = var_5241)[name = string("op_5246")];
+            fp16 var_5248_to_fp16 = const()[name = string("op_5248_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_15_cast_fp16 = add(x = var_5246, y = var_5248_to_fp16)[name = string("mean_sq_15_cast_fp16")];
+            fp32 var_5250_epsilon_0 = const()[name = string("op_5250_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_5250_cast_fp16 = rsqrt(epsilon = var_5250_epsilon_0, x = mean_sq_15_cast_fp16)[name = string("op_5250_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_217_cast_fp16 = mul(x = var_5195, y = var_5250_cast_fp16)[name = string("input_217_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_93 = transpose(perm = var_5238, x = var_5233)[name = string("transpose_94")];
+            tensor<fp16, [1, 2, 3, 256]> var_5252_cast_fp16 = mul(x = q_93, y = cos_s)[name = string("op_5252_cast_fp16")];
+            tensor<int32, [2]> var_5253_split_sizes_0 = const()[name = string("op_5253_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5253_axis_0 = const()[name = string("op_5253_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_5253_0, tensor<fp16, [1, 2, 3, 128]> var_5253_1 = split(axis = var_5253_axis_0, split_sizes = var_5253_split_sizes_0, x = q_93)[name = string("op_5253")];
+            fp16 const_86_promoted = const()[name = string("const_86_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_5255 = mul(x = var_5253_1, y = const_86_promoted)[name = string("op_5255")];
+            int32 var_5257 = const()[name = string("op_5257"), val = int32(-1)];
+            bool var_5258_interleave_0 = const()[name = string("op_5258_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_5258 = concat(axis = var_5257, interleave = var_5258_interleave_0, values = (var_5255, var_5253_0))[name = string("op_5258")];
+            tensor<fp16, [1, 2, 3, 256]> var_5259_cast_fp16 = mul(x = var_5258, y = sin_s)[name = string("op_5259_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_215_cast_fp16 = add(x = var_5252_cast_fp16, y = var_5259_cast_fp16)[name = string("input_215_cast_fp16")];
+            tensor<int32, [8]> k_padded_13_pad_0 = const()[name = string("k_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_13_mode_0 = const()[name = string("k_padded_13_mode_0"), val = string("constant")];
+            fp16 const_87_to_fp16 = const()[name = string("const_87_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_13_cast_fp16 = pad(constant_val = const_87_to_fp16, mode = k_padded_13_mode_0, pad = k_padded_13_pad_0, x = input_215_cast_fp16)[name = string("k_padded_13_cast_fp16")];
+            tensor<int32, [8]> v_padded_13_pad_0 = const()[name = string("v_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_13_mode_0 = const()[name = string("v_padded_13_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_13_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = v_padded_13_mode_0, pad = v_padded_13_pad_0, x = input_217_cast_fp16)[name = string("v_padded_13_cast_fp16")];
+            tensor<int32, [4]> slot_k_15_begin_0 = const()[name = string("slot_k_15_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_15_end_0 = const()[name = string("slot_k_15_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_15_end_mask_0 = const()[name = string("slot_k_15_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_15_cast_fp16 = slice_by_index(begin = slot_k_15_begin_0, end = slot_k_15_end_0, end_mask = slot_k_15_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("slot_k_15_cast_fp16")];
+            tensor<int32, [4]> slot_v_15_begin_0 = const()[name = string("slot_v_15_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_15_end_0 = const()[name = string("slot_v_15_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_15_end_mask_0 = const()[name = string("slot_v_15_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_15_cast_fp16 = slice_by_index(begin = slot_v_15_begin_0, end = slot_v_15_end_0, end_mask = slot_v_15_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("slot_v_15_cast_fp16")];
+            tensor<int32, [4]> var_5298_begin_0 = const()[name = string("op_5298_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5298_end_0 = const()[name = string("op_5298_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5298_end_mask_0 = const()[name = string("op_5298_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5298_cast_fp16 = slice_by_index(begin = var_5298_begin_0, end = var_5298_end_0, end_mask = var_5298_end_mask_0, x = slot_k_15_cast_fp16)[name = string("op_5298_cast_fp16")];
+            int32 var_5305 = const()[name = string("op_5305"), val = int32(2)];
+            bool new_k_15_interleave_0 = const()[name = string("new_k_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_15_cast_fp16 = concat(axis = var_5305, interleave = new_k_15_interleave_0, values = (var_5298_cast_fp16, k_padded_13_cast_fp16))[name = string("new_k_15_cast_fp16")];
+            tensor<int32, [4]> var_5321_begin_0 = const()[name = string("op_5321_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5321_end_0 = const()[name = string("op_5321_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5321_end_mask_0 = const()[name = string("op_5321_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5321_cast_fp16 = slice_by_index(begin = var_5321_begin_0, end = var_5321_end_0, end_mask = var_5321_end_mask_0, x = slot_v_15_cast_fp16)[name = string("op_5321_cast_fp16")];
+            int32 var_5328 = const()[name = string("op_5328"), val = int32(2)];
+            bool new_v_15_interleave_0 = const()[name = string("new_v_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_15_cast_fp16 = concat(axis = var_5328, interleave = new_v_15_interleave_0, values = (var_5321_cast_fp16, v_padded_13_cast_fp16))[name = string("new_v_15_cast_fp16")];
+            tensor<int32, [4]> var_5334_begin_0 = const()[name = string("op_5334_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5334_end_0 = const()[name = string("op_5334_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_5334_end_mask_0 = const()[name = string("op_5334_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_5334_cast_fp16 = slice_by_index(begin = var_5334_begin_0, end = var_5334_end_0, end_mask = var_5334_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("op_5334_cast_fp16")];
+            tensor<int32, [4]> var_5339_begin_0 = const()[name = string("op_5339_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5339_end_0 = const()[name = string("op_5339_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_5339_end_mask_0 = const()[name = string("op_5339_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_5339_cast_fp16 = slice_by_index(begin = var_5339_begin_0, end = var_5339_end_0, end_mask = var_5339_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("op_5339_cast_fp16")];
+            int32 var_5341 = const()[name = string("op_5341"), val = int32(0)];
+            bool K_sliding_out_13_interleave_0 = const()[name = string("K_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_13_cast_fp16 = concat(axis = var_5341, interleave = K_sliding_out_13_interleave_0, values = (var_5334_cast_fp16, new_k_15_cast_fp16, var_5339_cast_fp16))[name = string("K_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_5347_begin_0 = const()[name = string("op_5347_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5347_end_0 = const()[name = string("op_5347_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_5347_end_mask_0 = const()[name = string("op_5347_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [6, 2, 512, 512]> var_5347_cast_fp16 = slice_by_index(begin = var_5347_begin_0, end = var_5347_end_0, end_mask = var_5347_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("op_5347_cast_fp16")];
+            tensor<int32, [4]> var_5352_begin_0 = const()[name = string("op_5352_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5352_end_0 = const()[name = string("op_5352_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_5352_end_mask_0 = const()[name = string("op_5352_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [3, 2, 512, 512]> var_5352_cast_fp16 = slice_by_index(begin = var_5352_begin_0, end = var_5352_end_0, end_mask = var_5352_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("op_5352_cast_fp16")];
+            int32 var_5354 = const()[name = string("op_5354"), val = int32(0)];
+            bool V_sliding_out_13_interleave_0 = const()[name = string("V_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_13_cast_fp16 = concat(axis = var_5354, interleave = V_sliding_out_13_interleave_0, values = (var_5347_cast_fp16, new_v_15_cast_fp16, var_5352_cast_fp16))[name = string("V_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_5360_begin_0 = const()[name = string("op_5360_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_5360_end_0 = const()[name = string("op_5360_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5360_end_mask_0 = const()[name = string("op_5360_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_5360_cast_fp16 = slice_by_index(begin = var_5360_begin_0, end = var_5360_end_0, end_mask = var_5360_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5360_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_15_begin_0 = const()[name = string("K_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_15_end_0 = const()[name = string("K_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_15_end_mask_0 = const()[name = string("K_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_15_cast_fp16 = slice_by_index(begin = K_for_attn_15_begin_0, end = K_for_attn_15_end_0, end_mask = K_for_attn_15_end_mask_0, x = var_5360_cast_fp16)[name = string("K_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> var_5370_begin_0 = const()[name = string("op_5370_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_5370_end_0 = const()[name = string("op_5370_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5370_end_mask_0 = const()[name = string("op_5370_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_5370_cast_fp16 = slice_by_index(begin = var_5370_begin_0, end = var_5370_end_0, end_mask = var_5370_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5370_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_15_begin_0 = const()[name = string("V_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_15_end_0 = const()[name = string("V_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_15_end_mask_0 = const()[name = string("V_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_15_cast_fp16 = slice_by_index(begin = V_for_attn_15_begin_0, end = V_for_attn_15_end_0, end_mask = V_for_attn_15_end_mask_0, x = var_5370_cast_fp16)[name = string("V_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> transpose_28_perm_0 = const()[name = string("transpose_28_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_14_reps_0 = const()[name = string("tile_14_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_28_cast_fp16 = transpose(perm = transpose_28_perm_0, x = K_for_attn_15_cast_fp16)[name = string("transpose_93")];
+            tensor<fp16, [8, 1, 512, 256]> tile_14_cast_fp16 = tile(reps = tile_14_reps_0, x = transpose_28_cast_fp16)[name = string("tile_14_cast_fp16")];
+            tensor<int32, [5]> concat_30 = const()[name = string("concat_30"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_28_cast_fp16 = reshape(shape = concat_30, x = tile_14_cast_fp16)[name = string("reshape_28_cast_fp16")];
+            tensor<int32, [5]> transpose_29_perm_0 = const()[name = string("transpose_29_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_31 = const()[name = string("concat_31"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_29_cast_fp16 = transpose(perm = transpose_29_perm_0, x = reshape_28_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_29_cast_fp16 = reshape(shape = concat_31, x = transpose_29_cast_fp16)[name = string("reshape_29_cast_fp16")];
+            tensor<int32, [4]> transpose_71_perm_0 = const()[name = string("transpose_71_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_30_perm_0 = const()[name = string("transpose_30_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_15_reps_0 = const()[name = string("tile_15_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_30_cast_fp16 = transpose(perm = transpose_30_perm_0, x = V_for_attn_15_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [8, 1, 512, 256]> tile_15_cast_fp16 = tile(reps = tile_15_reps_0, x = transpose_30_cast_fp16)[name = string("tile_15_cast_fp16")];
+            tensor<int32, [5]> concat_32 = const()[name = string("concat_32"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_30_cast_fp16 = reshape(shape = concat_32, x = tile_15_cast_fp16)[name = string("reshape_30_cast_fp16")];
+            tensor<int32, [5]> transpose_31_perm_0 = const()[name = string("transpose_31_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_33 = const()[name = string("concat_33"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_31_cast_fp16 = transpose(perm = transpose_31_perm_0, x = reshape_30_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_31_cast_fp16 = reshape(shape = concat_33, x = transpose_31_cast_fp16)[name = string("reshape_31_cast_fp16")];
+            tensor<int32, [4]> V_expanded_15_perm_0 = const()[name = string("V_expanded_15_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_71_cast_fp16 = transpose(perm = transpose_71_perm_0, x = reshape_29_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_95_cast_fp16, y = transpose_71_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_147_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_147_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_147_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 3, 512]> var_5405 = sub(x = x_147_cast_fp16, y = reduce_max_7)[name = string("op_5405")];
+            tensor<fp16, [1, 8, 3, 512]> var_5411 = exp(x = var_5405)[name = string("op_5411")];
+            tensor<int32, [1]> var_5421_axes_0 = const()[name = string("op_5421_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5421_keep_dims_0 = const()[name = string("op_5421_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_5421 = reduce_sum(axes = var_5421_axes_0, keep_dims = var_5421_keep_dims_0, x = var_5411)[name = string("op_5421")];
+            tensor<fp16, [1, 8, 3, 512]> var_5427_cast_fp16 = real_div(x = var_5411, y = var_5421)[name = string("op_5427_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_15_cast_fp16 = transpose(perm = V_expanded_15_perm_0, x = reshape_31_cast_fp16)[name = string("transpose_88")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_5427_cast_fp16, y = V_expanded_15_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_5438 = const()[name = string("op_5438"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5445 = const()[name = string("op_5445"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_5439_cast_fp16 = transpose(perm = var_5438, x = attn_output_43_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 3, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_5445, x = var_5439_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_5450 = const()[name = string("op_5450"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5466_pad_type_0 = const()[name = string("op_5466_pad_type_0"), val = string("valid")];
+            int32 var_5466_groups_0 = const()[name = string("op_5466_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5466_strides_0 = const()[name = string("op_5466_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5466_pad_0 = const()[name = string("op_5466_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5466_dilations_0 = const()[name = string("op_5466_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(554704576))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557326080))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_5451_cast_fp16 = transpose(perm = var_5450, x = attn_output_45_cast_fp16)[name = string("transpose_86")];
+            tensor<fp16, [1, 2560, 3]> var_5466_cast_fp16 = conv(dilations = var_5466_dilations_0, groups = var_5466_groups_0, pad = var_5466_pad_0, pad_type = var_5466_pad_type_0, strides = var_5466_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_5451_cast_fp16)[name = string("op_5466_cast_fp16")];
+            tensor<int32, [3]> var_5470 = const()[name = string("op_5470"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5476 = const()[name = string("op_5476"), val = int32(-1)];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_151_cast_fp16 = transpose(perm = var_5470, x = var_5466_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 3, 2560]> var_5478_cast_fp16 = mul(x = x_151_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_5478_cast_fp16")];
+            bool input_221_interleave_0 = const()[name = string("input_221_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_221_cast_fp16 = concat(axis = var_5476, interleave = input_221_interleave_0, values = (x_151_cast_fp16, var_5478_cast_fp16))[name = string("input_221_cast_fp16")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5473_to_fp16 = const()[name = string("op_5473_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_5473_to_fp16, x = input_221_cast_fp16)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_5483_split_sizes_0 = const()[name = string("op_5483_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5483_axis_0 = const()[name = string("op_5483_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5483_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5483_cast_fp16_1 = split(axis = var_5483_axis_0, split_sizes = var_5483_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_5483_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557328704)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_47_cast_fp16 = mul(x = var_5483_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_153_cast_fp16 = add(x = x_139_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_153_cast_fp16")];
+            int32 var_5492 = const()[name = string("op_5492"), val = int32(-1)];
+            fp16 const_90_promoted_to_fp16 = const()[name = string("const_90_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_5494_cast_fp16 = mul(x = x_153_cast_fp16, y = const_90_promoted_to_fp16)[name = string("op_5494_cast_fp16")];
+            bool input_223_interleave_0 = const()[name = string("input_223_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_223_cast_fp16 = concat(axis = var_5492, interleave = input_223_interleave_0, values = (x_153_cast_fp16, var_5494_cast_fp16))[name = string("input_223_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5489_to_fp16 = const()[name = string("op_5489_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_5489_to_fp16, x = input_223_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_5499_split_sizes_0 = const()[name = string("op_5499_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5499_axis_0 = const()[name = string("op_5499_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5499_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5499_cast_fp16_1 = split(axis = var_5499_axis_0, split_sizes = var_5499_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_5499_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557333888)))];
+            tensor<fp16, [1, 3, 2560]> h_45_cast_fp16 = mul(x = var_5499_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_5510 = const()[name = string("op_5510"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_225_axes_0 = const()[name = string("input_225_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5511 = transpose(perm = var_5510, x = h_45_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 2560, 1, 3]> input_225 = expand_dims(axes = input_225_axes_0, x = var_5511)[name = string("input_225")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_225)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_225)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 3]> input_227 = mul(x = gate_31, y = up_15)[name = string("input_227")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_227)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_5551_axes_0 = const()[name = string("op_5551_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5551 = squeeze(axes = var_5551_axes_0, x = mlp_out_15)[name = string("op_5551")];
+            tensor<int32, [3]> var_5555 = const()[name = string("op_5555"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5561 = const()[name = string("op_5561"), val = int32(-1)];
+            fp16 const_91_promoted = const()[name = string("const_91_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_155 = transpose(perm = var_5555, x = var_5551)[name = string("transpose_83")];
+            tensor<fp16, [1, 3, 2560]> var_5563 = mul(x = x_155, y = const_91_promoted)[name = string("op_5563")];
+            bool input_229_interleave_0 = const()[name = string("input_229_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_229 = concat(axis = var_5561, interleave = input_229_interleave_0, values = (x_155, var_5563))[name = string("input_229")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5558_to_fp16 = const()[name = string("op_5558_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_5558_to_fp16, x = input_229)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_5568_split_sizes_0 = const()[name = string("op_5568_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5568_axis_0 = const()[name = string("op_5568_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5568_0, tensor<fp16, [1, 3, 2560]> var_5568_1 = split(axis = var_5568_axis_0, split_sizes = var_5568_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_5568")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_73 = mul(x = var_5568_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_75_cast_fp16 = add(x = x_153_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 4864])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 3, 5120])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_5596 = const()[name = string("op_5596"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_231_axes_0 = const()[name = string("input_231_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5597 = transpose(perm = var_5596, x = hidden_states_75_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1, 3]> input_231 = expand_dims(axes = input_231_axes_0, x = var_5597)[name = string("input_231")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_231)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_5616 = const()[name = string("op_5616"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_5617_cast_fp16 = transpose(perm = var_5616, x = per_layer_slice_15_cast_fp16)[name = string("transpose_81")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_5617_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_233_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_233_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557339072))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557666816))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_233_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_5633_axes_0 = const()[name = string("op_5633_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5633_cast_fp16 = squeeze(axes = var_5633_axes_0, x = gated_47_cast_fp16)[name = string("op_5633_cast_fp16")];
+            tensor<int32, [3]> var_5637 = const()[name = string("op_5637"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5643 = const()[name = string("op_5643"), val = int32(-1)];
+            fp16 const_92_promoted_to_fp16 = const()[name = string("const_92_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_157_cast_fp16 = transpose(perm = var_5637, x = var_5633_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 3, 2560]> var_5645_cast_fp16 = mul(x = x_157_cast_fp16, y = const_92_promoted_to_fp16)[name = string("op_5645_cast_fp16")];
+            bool input_235_interleave_0 = const()[name = string("input_235_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_235_cast_fp16 = concat(axis = var_5643, interleave = input_235_interleave_0, values = (x_157_cast_fp16, var_5645_cast_fp16))[name = string("input_235_cast_fp16")];
+            tensor<int32, [1]> normed_221_axes_0 = const()[name = string("normed_221_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5640_to_fp16 = const()[name = string("op_5640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_221_cast_fp16 = layer_norm(axes = normed_221_axes_0, epsilon = var_5640_to_fp16, x = input_235_cast_fp16)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [2]> var_5650_split_sizes_0 = const()[name = string("op_5650_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5650_axis_0 = const()[name = string("op_5650_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5650_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5650_cast_fp16_1 = split(axis = var_5650_axis_0, split_sizes = var_5650_split_sizes_0, x = normed_221_cast_fp16)[name = string("op_5650_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557669440)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_79_cast_fp16 = mul(x = var_5650_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_93_promoted_to_fp16 = const()[name = string("const_93_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.06p-1])];
+            tensor<fp16, [1, 3, 2560]> x_159_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_93_promoted_to_fp16)[name = string("x_159_cast_fp16")];
+            int32 var_5665 = const()[name = string("op_5665"), val = int32(-1)];
+            fp16 const_94_promoted_to_fp16 = const()[name = string("const_94_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_5667_cast_fp16 = mul(x = x_159_cast_fp16, y = const_94_promoted_to_fp16)[name = string("op_5667_cast_fp16")];
+            bool input_237_interleave_0 = const()[name = string("input_237_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_237_cast_fp16 = concat(axis = var_5665, interleave = input_237_interleave_0, values = (x_159_cast_fp16, var_5667_cast_fp16))[name = string("input_237_cast_fp16")];
+            tensor<int32, [1]> normed_225_axes_0 = const()[name = string("normed_225_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5662_to_fp16 = const()[name = string("op_5662_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_225_cast_fp16 = layer_norm(axes = normed_225_axes_0, epsilon = var_5662_to_fp16, x = input_237_cast_fp16)[name = string("normed_225_cast_fp16")];
+            tensor<int32, [2]> var_5672_split_sizes_0 = const()[name = string("op_5672_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5672_axis_0 = const()[name = string("op_5672_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_5672_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_5672_cast_fp16_1 = split(axis = var_5672_axis_0, split_sizes = var_5672_split_sizes_0, x = normed_225_cast_fp16)[name = string("op_5672_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557674624)))];
+            tensor<fp16, [1, 3, 2560]> h_49_cast_fp16 = mul(x = var_5672_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_5678 = const()[name = string("op_5678"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5681_axes_0 = const()[name = string("op_5681_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_5679_cast_fp16 = transpose(perm = var_5678, x = h_49_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 2560, 1, 3]> var_5681_cast_fp16 = expand_dims(axes = var_5681_axes_0, x = var_5679_cast_fp16)[name = string("op_5681_cast_fp16")];
+            string q_97_pad_type_0 = const()[name = string("q_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_97_strides_0 = const()[name = string("q_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_97_pad_0 = const()[name = string("q_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_97_dilations_0 = const()[name = string("q_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_97_groups_0 = const()[name = string("q_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_97 = conv(dilations = q_97_dilations_0, groups = q_97_groups_0, pad = q_97_pad_0, pad_type = q_97_pad_type_0, strides = q_97_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_5681_cast_fp16)[name = string("q_97")];
+            tensor<int32, [4]> var_5702 = const()[name = string("op_5702"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_5703 = reshape(shape = var_5702, x = q_97)[name = string("op_5703")];
+            tensor<int32, [4]> transpose_72_perm_0 = const()[name = string("transpose_72_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_5726 = const()[name = string("op_5726"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_72 = transpose(perm = transpose_72_perm_0, x = var_5703)[name = string("transpose_78")];
+            tensor<fp16, [3, 8, 256]> x_161 = reshape(shape = var_5726, x = transpose_72)[name = string("x_161")];
+            int32 var_5732 = const()[name = string("op_5732"), val = int32(-1)];
+            fp16 const_95_promoted = const()[name = string("const_95_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_5734 = mul(x = x_161, y = const_95_promoted)[name = string("op_5734")];
+            bool input_241_interleave_0 = const()[name = string("input_241_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_241 = concat(axis = var_5732, interleave = input_241_interleave_0, values = (x_161, var_5734))[name = string("input_241")];
+            tensor<int32, [1]> normed_229_axes_0 = const()[name = string("normed_229_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5729_to_fp16 = const()[name = string("op_5729_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_229_cast_fp16 = layer_norm(axes = normed_229_axes_0, epsilon = var_5729_to_fp16, x = input_241)[name = string("normed_229_cast_fp16")];
+            tensor<int32, [2]> var_5739_split_sizes_0 = const()[name = string("op_5739_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5739_axis_0 = const()[name = string("op_5739_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_5739_0, tensor<fp16, [3, 8, 256]> var_5739_1 = split(axis = var_5739_axis_0, split_sizes = var_5739_split_sizes_0, x = normed_229_cast_fp16)[name = string("op_5739")];
+            tensor<int32, [4]> var_5746 = const()[name = string("op_5746"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_5747 = reshape(shape = var_5746, x = var_5739_0)[name = string("op_5747")];
+            tensor<int32, [4]> var_5752 = const()[name = string("op_5752"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_103 = transpose(perm = var_5752, x = var_5747)[name = string("transpose_77")];
+            tensor<fp16, [1, 8, 3, 256]> var_5754_cast_fp16 = mul(x = q_103, y = cos_s)[name = string("op_5754_cast_fp16")];
+            tensor<int32, [2]> var_5755_split_sizes_0 = const()[name = string("op_5755_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5755_axis_0 = const()[name = string("op_5755_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_5755_0, tensor<fp16, [1, 8, 3, 128]> var_5755_1 = split(axis = var_5755_axis_0, split_sizes = var_5755_split_sizes_0, x = q_103)[name = string("op_5755")];
+            fp16 const_96_promoted = const()[name = string("const_96_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_5757 = mul(x = var_5755_1, y = const_96_promoted)[name = string("op_5757")];
+            int32 var_5759 = const()[name = string("op_5759"), val = int32(-1)];
+            bool var_5760_interleave_0 = const()[name = string("op_5760_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_5760 = concat(axis = var_5759, interleave = var_5760_interleave_0, values = (var_5757, var_5755_0))[name = string("op_5760")];
+            tensor<fp16, [1, 8, 3, 256]> var_5761_cast_fp16 = mul(x = var_5760, y = sin_s)[name = string("op_5761_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_107_cast_fp16 = add(x = var_5754_cast_fp16, y = var_5761_cast_fp16)[name = string("q_107_cast_fp16")];
+            string k_51_pad_type_0 = const()[name = string("k_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_51_strides_0 = const()[name = string("k_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_51_pad_0 = const()[name = string("k_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_51_dilations_0 = const()[name = string("k_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_51_groups_0 = const()[name = string("k_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_51 = conv(dilations = k_51_dilations_0, groups = k_51_groups_0, pad = k_51_pad_0, pad_type = k_51_pad_type_0, strides = k_51_strides_0, weight = layers_8_self_attn_k_proj_weight_palettized, x = var_5681_cast_fp16)[name = string("k_51")];
+            tensor<int32, [4]> var_5779 = const()[name = string("op_5779"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5780 = reshape(shape = var_5779, x = k_51)[name = string("op_5780")];
+            tensor<int32, [4]> transpose_73_perm_0 = const()[name = string("transpose_73_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_19_pad_type_0 = const()[name = string("v_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_19_strides_0 = const()[name = string("v_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_19_pad_0 = const()[name = string("v_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_19_dilations_0 = const()[name = string("v_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_19_groups_0 = const()[name = string("v_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_19 = conv(dilations = v_19_dilations_0, groups = v_19_groups_0, pad = v_19_pad_0, pad_type = v_19_pad_type_0, strides = v_19_strides_0, weight = layers_8_self_attn_v_proj_weight_palettized, x = var_5681_cast_fp16)[name = string("v_19")];
+            tensor<int32, [4]> var_5807 = const()[name = string("op_5807"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_5808 = reshape(shape = var_5807, x = v_19)[name = string("op_5808")];
+            tensor<int32, [4]> var_5813 = const()[name = string("op_5813"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5831 = const()[name = string("op_5831"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_73 = transpose(perm = transpose_73_perm_0, x = var_5780)[name = string("transpose_76")];
+            tensor<fp16, [3, 2, 256]> x_163 = reshape(shape = var_5831, x = transpose_73)[name = string("x_163")];
+            int32 var_5837 = const()[name = string("op_5837"), val = int32(-1)];
+            fp16 const_97_promoted = const()[name = string("const_97_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_5839 = mul(x = x_163, y = const_97_promoted)[name = string("op_5839")];
+            bool input_243_interleave_0 = const()[name = string("input_243_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_243 = concat(axis = var_5837, interleave = input_243_interleave_0, values = (x_163, var_5839))[name = string("input_243")];
+            tensor<int32, [1]> normed_233_axes_0 = const()[name = string("normed_233_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5834_to_fp16 = const()[name = string("op_5834_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_233_cast_fp16 = layer_norm(axes = normed_233_axes_0, epsilon = var_5834_to_fp16, x = input_243)[name = string("normed_233_cast_fp16")];
+            tensor<int32, [2]> var_5844_split_sizes_0 = const()[name = string("op_5844_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5844_axis_0 = const()[name = string("op_5844_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_5844_0, tensor<fp16, [3, 2, 256]> var_5844_1 = split(axis = var_5844_axis_0, split_sizes = var_5844_split_sizes_0, x = normed_233_cast_fp16)[name = string("op_5844")];
+            tensor<fp16, [3, 2, 256]> k_55 = mul(x = var_5844_0, y = layers_8_self_attn_k_norm_weight)[name = string("k_55")];
+            tensor<int32, [4]> var_5851 = const()[name = string("op_5851"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_5852 = reshape(shape = var_5851, x = k_55)[name = string("op_5852")];
+            tensor<int32, [4]> var_5857 = const()[name = string("op_5857"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_5859_promoted = const()[name = string("op_5859_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_5814 = transpose(perm = var_5813, x = var_5808)[name = string("transpose_75")];
+            tensor<fp16, [1, 2, 3, 256]> var_5860 = pow(x = var_5814, y = var_5859_promoted)[name = string("op_5860")];
+            tensor<int32, [1]> var_5865_axes_0 = const()[name = string("op_5865_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5865_keep_dims_0 = const()[name = string("op_5865_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_5865 = reduce_mean(axes = var_5865_axes_0, keep_dims = var_5865_keep_dims_0, x = var_5860)[name = string("op_5865")];
+            fp16 var_5867_to_fp16 = const()[name = string("op_5867_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_17_cast_fp16 = add(x = var_5865, y = var_5867_to_fp16)[name = string("mean_sq_17_cast_fp16")];
+            fp32 var_5869_epsilon_0 = const()[name = string("op_5869_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_5869_cast_fp16 = rsqrt(epsilon = var_5869_epsilon_0, x = mean_sq_17_cast_fp16)[name = string("op_5869_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_247_cast_fp16 = mul(x = var_5814, y = var_5869_cast_fp16)[name = string("input_247_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_105 = transpose(perm = var_5857, x = var_5852)[name = string("transpose_74")];
+            tensor<fp16, [1, 2, 3, 256]> var_5871_cast_fp16 = mul(x = q_105, y = cos_s)[name = string("op_5871_cast_fp16")];
+            tensor<int32, [2]> var_5872_split_sizes_0 = const()[name = string("op_5872_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5872_axis_0 = const()[name = string("op_5872_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_5872_0, tensor<fp16, [1, 2, 3, 128]> var_5872_1 = split(axis = var_5872_axis_0, split_sizes = var_5872_split_sizes_0, x = q_105)[name = string("op_5872")];
+            fp16 const_98_promoted = const()[name = string("const_98_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_5874 = mul(x = var_5872_1, y = const_98_promoted)[name = string("op_5874")];
+            int32 var_5876 = const()[name = string("op_5876"), val = int32(-1)];
+            bool var_5877_interleave_0 = const()[name = string("op_5877_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_5877 = concat(axis = var_5876, interleave = var_5877_interleave_0, values = (var_5874, var_5872_0))[name = string("op_5877")];
+            tensor<fp16, [1, 2, 3, 256]> var_5878_cast_fp16 = mul(x = var_5877, y = sin_s)[name = string("op_5878_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_245_cast_fp16 = add(x = var_5871_cast_fp16, y = var_5878_cast_fp16)[name = string("input_245_cast_fp16")];
+            tensor<int32, [8]> k_padded_15_pad_0 = const()[name = string("k_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_15_mode_0 = const()[name = string("k_padded_15_mode_0"), val = string("constant")];
+            fp16 const_99_to_fp16 = const()[name = string("const_99_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_15_cast_fp16 = pad(constant_val = const_99_to_fp16, mode = k_padded_15_mode_0, pad = k_padded_15_pad_0, x = input_245_cast_fp16)[name = string("k_padded_15_cast_fp16")];
+            tensor<int32, [8]> v_padded_15_pad_0 = const()[name = string("v_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_15_mode_0 = const()[name = string("v_padded_15_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_15_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = v_padded_15_mode_0, pad = v_padded_15_pad_0, x = input_247_cast_fp16)[name = string("v_padded_15_cast_fp16")];
+            tensor<int32, [4]> slot_k_17_begin_0 = const()[name = string("slot_k_17_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_17_end_0 = const()[name = string("slot_k_17_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_17_end_mask_0 = const()[name = string("slot_k_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_17_cast_fp16 = slice_by_index(begin = slot_k_17_begin_0, end = slot_k_17_end_0, end_mask = slot_k_17_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("slot_k_17_cast_fp16")];
+            tensor<int32, [4]> slot_v_17_begin_0 = const()[name = string("slot_v_17_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_17_end_0 = const()[name = string("slot_v_17_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_17_end_mask_0 = const()[name = string("slot_v_17_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_17_cast_fp16 = slice_by_index(begin = slot_v_17_begin_0, end = slot_v_17_end_0, end_mask = slot_v_17_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("slot_v_17_cast_fp16")];
+            tensor<int32, [4]> var_5917_begin_0 = const()[name = string("op_5917_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5917_end_0 = const()[name = string("op_5917_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5917_end_mask_0 = const()[name = string("op_5917_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5917_cast_fp16 = slice_by_index(begin = var_5917_begin_0, end = var_5917_end_0, end_mask = var_5917_end_mask_0, x = slot_k_17_cast_fp16)[name = string("op_5917_cast_fp16")];
+            int32 var_5924 = const()[name = string("op_5924"), val = int32(2)];
+            bool new_k_17_interleave_0 = const()[name = string("new_k_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_17_cast_fp16 = concat(axis = var_5924, interleave = new_k_17_interleave_0, values = (var_5917_cast_fp16, k_padded_15_cast_fp16))[name = string("new_k_17_cast_fp16")];
+            tensor<int32, [4]> var_5940_begin_0 = const()[name = string("op_5940_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_5940_end_0 = const()[name = string("op_5940_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5940_end_mask_0 = const()[name = string("op_5940_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_5940_cast_fp16 = slice_by_index(begin = var_5940_begin_0, end = var_5940_end_0, end_mask = var_5940_end_mask_0, x = slot_v_17_cast_fp16)[name = string("op_5940_cast_fp16")];
+            int32 var_5947 = const()[name = string("op_5947"), val = int32(2)];
+            bool new_v_17_interleave_0 = const()[name = string("new_v_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_17_cast_fp16 = concat(axis = var_5947, interleave = new_v_17_interleave_0, values = (var_5940_cast_fp16, v_padded_15_cast_fp16))[name = string("new_v_17_cast_fp16")];
+            tensor<int32, [4]> var_5953_begin_0 = const()[name = string("op_5953_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5953_end_0 = const()[name = string("op_5953_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5953_end_mask_0 = const()[name = string("op_5953_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_5953_cast_fp16 = slice_by_index(begin = var_5953_begin_0, end = var_5953_end_0, end_mask = var_5953_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5953_cast_fp16")];
+            tensor<int32, [4]> var_5958_begin_0 = const()[name = string("op_5958_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_5958_end_0 = const()[name = string("op_5958_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_5958_end_mask_0 = const()[name = string("op_5958_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_5958_cast_fp16 = slice_by_index(begin = var_5958_begin_0, end = var_5958_end_0, end_mask = var_5958_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5958_cast_fp16")];
+            int32 var_5960 = const()[name = string("op_5960"), val = int32(0)];
+            bool K_sliding_out_15_interleave_0 = const()[name = string("K_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_15_cast_fp16 = concat(axis = var_5960, interleave = K_sliding_out_15_interleave_0, values = (var_5953_cast_fp16, new_k_17_cast_fp16, var_5958_cast_fp16))[name = string("K_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_5966_begin_0 = const()[name = string("op_5966_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_5966_end_0 = const()[name = string("op_5966_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5966_end_mask_0 = const()[name = string("op_5966_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [7, 2, 512, 512]> var_5966_cast_fp16 = slice_by_index(begin = var_5966_begin_0, end = var_5966_end_0, end_mask = var_5966_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5966_cast_fp16")];
+            tensor<int32, [4]> var_5971_begin_0 = const()[name = string("op_5971_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_5971_end_0 = const()[name = string("op_5971_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_5971_end_mask_0 = const()[name = string("op_5971_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [2, 2, 512, 512]> var_5971_cast_fp16 = slice_by_index(begin = var_5971_begin_0, end = var_5971_end_0, end_mask = var_5971_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5971_cast_fp16")];
+            int32 var_5973 = const()[name = string("op_5973"), val = int32(0)];
+            bool V_sliding_out_15_interleave_0 = const()[name = string("V_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_15_cast_fp16 = concat(axis = var_5973, interleave = V_sliding_out_15_interleave_0, values = (var_5966_cast_fp16, new_v_17_cast_fp16, var_5971_cast_fp16))[name = string("V_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_5979_begin_0 = const()[name = string("op_5979_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5979_end_0 = const()[name = string("op_5979_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5979_end_mask_0 = const()[name = string("op_5979_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_5979_cast_fp16 = slice_by_index(begin = var_5979_begin_0, end = var_5979_end_0, end_mask = var_5979_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("op_5979_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_17_begin_0 = const()[name = string("K_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_17_end_0 = const()[name = string("K_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_17_end_mask_0 = const()[name = string("K_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_17_cast_fp16 = slice_by_index(begin = K_for_attn_17_begin_0, end = K_for_attn_17_end_0, end_mask = K_for_attn_17_end_mask_0, x = var_5979_cast_fp16)[name = string("K_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> var_5989_begin_0 = const()[name = string("op_5989_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5989_end_0 = const()[name = string("op_5989_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5989_end_mask_0 = const()[name = string("op_5989_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_5989_cast_fp16 = slice_by_index(begin = var_5989_begin_0, end = var_5989_end_0, end_mask = var_5989_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("op_5989_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_17_begin_0 = const()[name = string("V_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_17_end_0 = const()[name = string("V_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_17_end_mask_0 = const()[name = string("V_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_17_cast_fp16 = slice_by_index(begin = V_for_attn_17_begin_0, end = V_for_attn_17_end_0, end_mask = V_for_attn_17_end_mask_0, x = var_5989_cast_fp16)[name = string("V_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> transpose_32_perm_0 = const()[name = string("transpose_32_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_16_reps_0 = const()[name = string("tile_16_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_32_cast_fp16 = transpose(perm = transpose_32_perm_0, x = K_for_attn_17_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [8, 1, 512, 256]> tile_16_cast_fp16 = tile(reps = tile_16_reps_0, x = transpose_32_cast_fp16)[name = string("tile_16_cast_fp16")];
+            tensor<int32, [5]> concat_34 = const()[name = string("concat_34"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_32_cast_fp16 = reshape(shape = concat_34, x = tile_16_cast_fp16)[name = string("reshape_32_cast_fp16")];
+            tensor<int32, [5]> transpose_33_perm_0 = const()[name = string("transpose_33_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_35 = const()[name = string("concat_35"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_33_cast_fp16 = transpose(perm = transpose_33_perm_0, x = reshape_32_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_33_cast_fp16 = reshape(shape = concat_35, x = transpose_33_cast_fp16)[name = string("reshape_33_cast_fp16")];
+            tensor<int32, [4]> transpose_74_perm_0 = const()[name = string("transpose_74_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_34_perm_0 = const()[name = string("transpose_34_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_17_reps_0 = const()[name = string("tile_17_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_34_cast_fp16 = transpose(perm = transpose_34_perm_0, x = V_for_attn_17_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [8, 1, 512, 256]> tile_17_cast_fp16 = tile(reps = tile_17_reps_0, x = transpose_34_cast_fp16)[name = string("tile_17_cast_fp16")];
+            tensor<int32, [5]> concat_36 = const()[name = string("concat_36"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_34_cast_fp16 = reshape(shape = concat_36, x = tile_17_cast_fp16)[name = string("reshape_34_cast_fp16")];
+            tensor<int32, [5]> transpose_35_perm_0 = const()[name = string("transpose_35_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_37 = const()[name = string("concat_37"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_35_cast_fp16 = transpose(perm = transpose_35_perm_0, x = reshape_34_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_35_cast_fp16 = reshape(shape = concat_37, x = transpose_35_cast_fp16)[name = string("reshape_35_cast_fp16")];
+            tensor<int32, [4]> V_expanded_17_perm_0 = const()[name = string("V_expanded_17_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_74_cast_fp16 = transpose(perm = transpose_74_perm_0, x = reshape_33_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_107_cast_fp16, y = transpose_74_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_167_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_167_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_167_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 3, 512]> var_6024 = sub(x = x_167_cast_fp16, y = reduce_max_8)[name = string("op_6024")];
+            tensor<fp16, [1, 8, 3, 512]> var_6030 = exp(x = var_6024)[name = string("op_6030")];
+            tensor<int32, [1]> var_6040_axes_0 = const()[name = string("op_6040_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6040_keep_dims_0 = const()[name = string("op_6040_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_6040 = reduce_sum(axes = var_6040_axes_0, keep_dims = var_6040_keep_dims_0, x = var_6030)[name = string("op_6040")];
+            tensor<fp16, [1, 8, 3, 512]> var_6046_cast_fp16 = real_div(x = var_6030, y = var_6040)[name = string("op_6046_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_17_cast_fp16 = transpose(perm = V_expanded_17_perm_0, x = reshape_35_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_6046_cast_fp16, y = V_expanded_17_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_6057 = const()[name = string("op_6057"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6064 = const()[name = string("op_6064"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_6058_cast_fp16 = transpose(perm = var_6057, x = attn_output_49_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 3, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_6064, x = var_6058_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_6069 = const()[name = string("op_6069"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6085_pad_type_0 = const()[name = string("op_6085_pad_type_0"), val = string("valid")];
+            int32 var_6085_groups_0 = const()[name = string("op_6085_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6085_strides_0 = const()[name = string("op_6085_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6085_pad_0 = const()[name = string("op_6085_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6085_dilations_0 = const()[name = string("op_6085_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(557679808))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560301312))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_6070_cast_fp16 = transpose(perm = var_6069, x = attn_output_51_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 2560, 3]> var_6085_cast_fp16 = conv(dilations = var_6085_dilations_0, groups = var_6085_groups_0, pad = var_6085_pad_0, pad_type = var_6085_pad_type_0, strides = var_6085_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_6070_cast_fp16)[name = string("op_6085_cast_fp16")];
+            tensor<int32, [3]> var_6089 = const()[name = string("op_6089"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6095 = const()[name = string("op_6095"), val = int32(-1)];
+            fp16 const_101_promoted_to_fp16 = const()[name = string("const_101_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_171_cast_fp16 = transpose(perm = var_6089, x = var_6085_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [1, 3, 2560]> var_6097_cast_fp16 = mul(x = x_171_cast_fp16, y = const_101_promoted_to_fp16)[name = string("op_6097_cast_fp16")];
+            bool input_251_interleave_0 = const()[name = string("input_251_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_251_cast_fp16 = concat(axis = var_6095, interleave = input_251_interleave_0, values = (x_171_cast_fp16, var_6097_cast_fp16))[name = string("input_251_cast_fp16")];
+            tensor<int32, [1]> normed_237_axes_0 = const()[name = string("normed_237_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6092_to_fp16 = const()[name = string("op_6092_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_237_cast_fp16 = layer_norm(axes = normed_237_axes_0, epsilon = var_6092_to_fp16, x = input_251_cast_fp16)[name = string("normed_237_cast_fp16")];
+            tensor<int32, [2]> var_6102_split_sizes_0 = const()[name = string("op_6102_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6102_axis_0 = const()[name = string("op_6102_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6102_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6102_cast_fp16_1 = split(axis = var_6102_axis_0, split_sizes = var_6102_split_sizes_0, x = normed_237_cast_fp16)[name = string("op_6102_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560303936)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_53_cast_fp16 = mul(x = var_6102_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_53_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_173_cast_fp16 = add(x = x_159_cast_fp16, y = attn_output_53_cast_fp16)[name = string("x_173_cast_fp16")];
+            int32 var_6111 = const()[name = string("op_6111"), val = int32(-1)];
+            fp16 const_102_promoted_to_fp16 = const()[name = string("const_102_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6113_cast_fp16 = mul(x = x_173_cast_fp16, y = const_102_promoted_to_fp16)[name = string("op_6113_cast_fp16")];
+            bool input_253_interleave_0 = const()[name = string("input_253_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_253_cast_fp16 = concat(axis = var_6111, interleave = input_253_interleave_0, values = (x_173_cast_fp16, var_6113_cast_fp16))[name = string("input_253_cast_fp16")];
+            tensor<int32, [1]> normed_241_axes_0 = const()[name = string("normed_241_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6108_to_fp16 = const()[name = string("op_6108_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_241_cast_fp16 = layer_norm(axes = normed_241_axes_0, epsilon = var_6108_to_fp16, x = input_253_cast_fp16)[name = string("normed_241_cast_fp16")];
+            tensor<int32, [2]> var_6118_split_sizes_0 = const()[name = string("op_6118_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6118_axis_0 = const()[name = string("op_6118_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6118_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6118_cast_fp16_1 = split(axis = var_6118_axis_0, split_sizes = var_6118_split_sizes_0, x = normed_241_cast_fp16)[name = string("op_6118_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560309120)))];
+            tensor<fp16, [1, 3, 2560]> h_51_cast_fp16 = mul(x = var_6118_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_6129 = const()[name = string("op_6129"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_255_axes_0 = const()[name = string("input_255_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6130 = transpose(perm = var_6129, x = h_51_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 2560, 1, 3]> input_255 = expand_dims(axes = input_255_axes_0, x = var_6130)[name = string("input_255")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_255)[name = string("gate_33")];
+            string up_17_pad_type_0 = const()[name = string("up_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_17_strides_0 = const()[name = string("up_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_17_pad_0 = const()[name = string("up_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_17_dilations_0 = const()[name = string("up_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_17_groups_0 = const()[name = string("up_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_17 = conv(dilations = up_17_dilations_0, groups = up_17_groups_0, pad = up_17_pad_0, pad_type = up_17_pad_type_0, strides = up_17_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_255)[name = string("up_17")];
+            string gate_35_mode_0 = const()[name = string("gate_35_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_35 = gelu(mode = gate_35_mode_0, x = gate_33)[name = string("gate_35")];
+            tensor<fp16, [1, 10240, 1, 3]> input_257 = mul(x = gate_35, y = up_17)[name = string("input_257")];
+            string mlp_out_17_pad_type_0 = const()[name = string("mlp_out_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_17_strides_0 = const()[name = string("mlp_out_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_17_pad_0 = const()[name = string("mlp_out_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_17_dilations_0 = const()[name = string("mlp_out_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_17_groups_0 = const()[name = string("mlp_out_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_17 = conv(dilations = mlp_out_17_dilations_0, groups = mlp_out_17_groups_0, pad = mlp_out_17_pad_0, pad_type = mlp_out_17_pad_type_0, strides = mlp_out_17_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_257)[name = string("mlp_out_17")];
+            tensor<int32, [1]> var_6170_axes_0 = const()[name = string("op_6170_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6170 = squeeze(axes = var_6170_axes_0, x = mlp_out_17)[name = string("op_6170")];
+            tensor<int32, [3]> var_6174 = const()[name = string("op_6174"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6180 = const()[name = string("op_6180"), val = int32(-1)];
+            fp16 const_103_promoted = const()[name = string("const_103_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_175 = transpose(perm = var_6174, x = var_6170)[name = string("transpose_63")];
+            tensor<fp16, [1, 3, 2560]> var_6182 = mul(x = x_175, y = const_103_promoted)[name = string("op_6182")];
+            bool input_259_interleave_0 = const()[name = string("input_259_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_259 = concat(axis = var_6180, interleave = input_259_interleave_0, values = (x_175, var_6182))[name = string("input_259")];
+            tensor<int32, [1]> normed_245_axes_0 = const()[name = string("normed_245_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6177_to_fp16 = const()[name = string("op_6177_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_245_cast_fp16 = layer_norm(axes = normed_245_axes_0, epsilon = var_6177_to_fp16, x = input_259)[name = string("normed_245_cast_fp16")];
+            tensor<int32, [2]> var_6187_split_sizes_0 = const()[name = string("op_6187_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6187_axis_0 = const()[name = string("op_6187_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6187_0, tensor<fp16, [1, 3, 2560]> var_6187_1 = split(axis = var_6187_axis_0, split_sizes = var_6187_split_sizes_0, x = normed_245_cast_fp16)[name = string("op_6187")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_83 = mul(x = var_6187_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_85_cast_fp16 = add(x = x_173_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_17_begin_0 = const()[name = string("per_layer_slice_17_begin_0"), val = tensor<int32, [3]>([0, 0, 5120])];
+            tensor<int32, [3]> per_layer_slice_17_end_0 = const()[name = string("per_layer_slice_17_end_0"), val = tensor<int32, [3]>([1, 3, 5376])];
+            tensor<bool, [3]> per_layer_slice_17_end_mask_0 = const()[name = string("per_layer_slice_17_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_17_cast_fp16 = slice_by_index(begin = per_layer_slice_17_begin_0, end = per_layer_slice_17_end_0, end_mask = per_layer_slice_17_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_17_cast_fp16")];
+            tensor<int32, [3]> var_6215 = const()[name = string("op_6215"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_261_axes_0 = const()[name = string("input_261_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6216 = transpose(perm = var_6215, x = hidden_states_85_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 2560, 1, 3]> input_261 = expand_dims(axes = input_261_axes_0, x = var_6216)[name = string("input_261")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_261)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_6235 = const()[name = string("op_6235"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_17_axes_0 = const()[name = string("per_layer_slice_conv_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_6236_cast_fp16 = transpose(perm = var_6235, x = per_layer_slice_17_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_17_cast_fp16 = expand_dims(axes = per_layer_slice_conv_17_axes_0, x = var_6236_cast_fp16)[name = string("per_layer_slice_conv_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_263_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_17_cast_fp16)[name = string("input_263_cast_fp16")];
+            string gated_53_pad_type_0 = const()[name = string("gated_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_53_strides_0 = const()[name = string("gated_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_53_pad_0 = const()[name = string("gated_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_53_dilations_0 = const()[name = string("gated_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_53_groups_0 = const()[name = string("gated_53_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560314304))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560642048))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_53_cast_fp16 = conv(dilations = gated_53_dilations_0, groups = gated_53_groups_0, pad = gated_53_pad_0, pad_type = gated_53_pad_type_0, strides = gated_53_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_263_cast_fp16)[name = string("gated_53_cast_fp16")];
+            tensor<int32, [1]> var_6252_axes_0 = const()[name = string("op_6252_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6252_cast_fp16 = squeeze(axes = var_6252_axes_0, x = gated_53_cast_fp16)[name = string("op_6252_cast_fp16")];
+            tensor<int32, [3]> var_6256 = const()[name = string("op_6256"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6262 = const()[name = string("op_6262"), val = int32(-1)];
+            fp16 const_104_promoted_to_fp16 = const()[name = string("const_104_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_177_cast_fp16 = transpose(perm = var_6256, x = var_6252_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 3, 2560]> var_6264_cast_fp16 = mul(x = x_177_cast_fp16, y = const_104_promoted_to_fp16)[name = string("op_6264_cast_fp16")];
+            bool input_265_interleave_0 = const()[name = string("input_265_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_265_cast_fp16 = concat(axis = var_6262, interleave = input_265_interleave_0, values = (x_177_cast_fp16, var_6264_cast_fp16))[name = string("input_265_cast_fp16")];
+            tensor<int32, [1]> normed_249_axes_0 = const()[name = string("normed_249_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6259_to_fp16 = const()[name = string("op_6259_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_249_cast_fp16 = layer_norm(axes = normed_249_axes_0, epsilon = var_6259_to_fp16, x = input_265_cast_fp16)[name = string("normed_249_cast_fp16")];
+            tensor<int32, [2]> var_6269_split_sizes_0 = const()[name = string("op_6269_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6269_axis_0 = const()[name = string("op_6269_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6269_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6269_cast_fp16_1 = split(axis = var_6269_axis_0, split_sizes = var_6269_split_sizes_0, x = normed_249_cast_fp16)[name = string("op_6269_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560644672)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_89_cast_fp16 = mul(x = var_6269_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_91_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1]> const_105_promoted_to_fp16 = const()[name = string("const_105_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.bap-2])];
+            tensor<fp16, [1, 3, 2560]> x_179_cast_fp16 = mul(x = hidden_states_91_cast_fp16, y = const_105_promoted_to_fp16)[name = string("x_179_cast_fp16")];
+            int32 var_6284 = const()[name = string("op_6284"), val = int32(-1)];
+            fp16 const_106_promoted_to_fp16 = const()[name = string("const_106_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6286_cast_fp16 = mul(x = x_179_cast_fp16, y = const_106_promoted_to_fp16)[name = string("op_6286_cast_fp16")];
+            bool input_267_interleave_0 = const()[name = string("input_267_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_267_cast_fp16 = concat(axis = var_6284, interleave = input_267_interleave_0, values = (x_179_cast_fp16, var_6286_cast_fp16))[name = string("input_267_cast_fp16")];
+            tensor<int32, [1]> normed_253_axes_0 = const()[name = string("normed_253_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6281_to_fp16 = const()[name = string("op_6281_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_253_cast_fp16 = layer_norm(axes = normed_253_axes_0, epsilon = var_6281_to_fp16, x = input_267_cast_fp16)[name = string("normed_253_cast_fp16")];
+            tensor<int32, [2]> var_6291_split_sizes_0 = const()[name = string("op_6291_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6291_axis_0 = const()[name = string("op_6291_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6291_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6291_cast_fp16_1 = split(axis = var_6291_axis_0, split_sizes = var_6291_split_sizes_0, x = normed_253_cast_fp16)[name = string("op_6291_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560649856)))];
+            tensor<fp16, [1, 3, 2560]> h_55_cast_fp16 = mul(x = var_6291_cast_fp16_0, y = layers_9_input_layernorm_weight_promoted_to_fp16)[name = string("h_55_cast_fp16")];
+            tensor<int32, [3]> var_6297 = const()[name = string("op_6297"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6300_axes_0 = const()[name = string("op_6300_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6298_cast_fp16 = transpose(perm = var_6297, x = h_55_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 2560, 1, 3]> var_6300_cast_fp16 = expand_dims(axes = var_6300_axes_0, x = var_6298_cast_fp16)[name = string("op_6300_cast_fp16")];
+            string q_109_pad_type_0 = const()[name = string("q_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_109_strides_0 = const()[name = string("q_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_109_pad_0 = const()[name = string("q_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_109_dilations_0 = const()[name = string("q_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_109_groups_0 = const()[name = string("q_109_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_109 = conv(dilations = q_109_dilations_0, groups = q_109_groups_0, pad = q_109_pad_0, pad_type = q_109_pad_type_0, strides = q_109_strides_0, weight = layers_9_self_attn_q_proj_weight_palettized, x = var_6300_cast_fp16)[name = string("q_109")];
+            tensor<int32, [4]> var_6321 = const()[name = string("op_6321"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_6322 = reshape(shape = var_6321, x = q_109)[name = string("op_6322")];
+            tensor<int32, [4]> transpose_75_perm_0 = const()[name = string("transpose_75_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_6345 = const()[name = string("op_6345"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_75 = transpose(perm = transpose_75_perm_0, x = var_6322)[name = string("transpose_58")];
+            tensor<fp16, [3, 8, 256]> x_181 = reshape(shape = var_6345, x = transpose_75)[name = string("x_181")];
+            int32 var_6351 = const()[name = string("op_6351"), val = int32(-1)];
+            fp16 const_107_promoted = const()[name = string("const_107_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_6353 = mul(x = x_181, y = const_107_promoted)[name = string("op_6353")];
+            bool input_271_interleave_0 = const()[name = string("input_271_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_271 = concat(axis = var_6351, interleave = input_271_interleave_0, values = (x_181, var_6353))[name = string("input_271")];
+            tensor<int32, [1]> normed_257_axes_0 = const()[name = string("normed_257_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6348_to_fp16 = const()[name = string("op_6348_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_257_cast_fp16 = layer_norm(axes = normed_257_axes_0, epsilon = var_6348_to_fp16, x = input_271)[name = string("normed_257_cast_fp16")];
+            tensor<int32, [2]> var_6358_split_sizes_0 = const()[name = string("op_6358_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6358_axis_0 = const()[name = string("op_6358_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_6358_0, tensor<fp16, [3, 8, 256]> var_6358_1 = split(axis = var_6358_axis_0, split_sizes = var_6358_split_sizes_0, x = normed_257_cast_fp16)[name = string("op_6358")];
+            tensor<fp16, [3, 8, 256]> q_113 = mul(x = var_6358_0, y = layers_9_self_attn_q_norm_weight)[name = string("q_113")];
+            tensor<int32, [4]> var_6365 = const()[name = string("op_6365"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_6366 = reshape(shape = var_6365, x = q_113)[name = string("op_6366")];
+            tensor<int32, [4]> var_6371 = const()[name = string("op_6371"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_115 = transpose(perm = var_6371, x = var_6366)[name = string("transpose_57")];
+            tensor<fp16, [1, 8, 3, 256]> var_6373_cast_fp16 = mul(x = q_115, y = cos_s)[name = string("op_6373_cast_fp16")];
+            tensor<int32, [2]> var_6374_split_sizes_0 = const()[name = string("op_6374_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6374_axis_0 = const()[name = string("op_6374_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_6374_0, tensor<fp16, [1, 8, 3, 128]> var_6374_1 = split(axis = var_6374_axis_0, split_sizes = var_6374_split_sizes_0, x = q_115)[name = string("op_6374")];
+            fp16 const_108_promoted = const()[name = string("const_108_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_6376 = mul(x = var_6374_1, y = const_108_promoted)[name = string("op_6376")];
+            int32 var_6378 = const()[name = string("op_6378"), val = int32(-1)];
+            bool var_6379_interleave_0 = const()[name = string("op_6379_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_6379 = concat(axis = var_6378, interleave = var_6379_interleave_0, values = (var_6376, var_6374_0))[name = string("op_6379")];
+            tensor<fp16, [1, 8, 3, 256]> var_6380_cast_fp16 = mul(x = var_6379, y = sin_s)[name = string("op_6380_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_119_cast_fp16 = add(x = var_6373_cast_fp16, y = var_6380_cast_fp16)[name = string("q_119_cast_fp16")];
+            string k_57_pad_type_0 = const()[name = string("k_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_57_strides_0 = const()[name = string("k_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_57_pad_0 = const()[name = string("k_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_57_dilations_0 = const()[name = string("k_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_57_groups_0 = const()[name = string("k_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_57 = conv(dilations = k_57_dilations_0, groups = k_57_groups_0, pad = k_57_pad_0, pad_type = k_57_pad_type_0, strides = k_57_strides_0, weight = layers_9_self_attn_k_proj_weight_palettized, x = var_6300_cast_fp16)[name = string("k_57")];
+            tensor<int32, [4]> var_6398 = const()[name = string("op_6398"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_6399 = reshape(shape = var_6398, x = k_57)[name = string("op_6399")];
+            tensor<int32, [4]> transpose_76_perm_0 = const()[name = string("transpose_76_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_21_pad_type_0 = const()[name = string("v_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_21_strides_0 = const()[name = string("v_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_21_pad_0 = const()[name = string("v_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_21_dilations_0 = const()[name = string("v_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_21_groups_0 = const()[name = string("v_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_21 = conv(dilations = v_21_dilations_0, groups = v_21_groups_0, pad = v_21_pad_0, pad_type = v_21_pad_type_0, strides = v_21_strides_0, weight = layers_9_self_attn_v_proj_weight_palettized, x = var_6300_cast_fp16)[name = string("v_21")];
+            tensor<int32, [4]> var_6426 = const()[name = string("op_6426"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_6427 = reshape(shape = var_6426, x = v_21)[name = string("op_6427")];
+            tensor<int32, [4]> var_6432 = const()[name = string("op_6432"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6450 = const()[name = string("op_6450"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_76 = transpose(perm = transpose_76_perm_0, x = var_6399)[name = string("transpose_56")];
+            tensor<fp16, [3, 2, 256]> x_183 = reshape(shape = var_6450, x = transpose_76)[name = string("x_183")];
+            int32 var_6456 = const()[name = string("op_6456"), val = int32(-1)];
+            fp16 const_109_promoted = const()[name = string("const_109_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_6458 = mul(x = x_183, y = const_109_promoted)[name = string("op_6458")];
+            bool input_273_interleave_0 = const()[name = string("input_273_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_273 = concat(axis = var_6456, interleave = input_273_interleave_0, values = (x_183, var_6458))[name = string("input_273")];
+            tensor<int32, [1]> normed_261_axes_0 = const()[name = string("normed_261_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6453_to_fp16 = const()[name = string("op_6453_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_261_cast_fp16 = layer_norm(axes = normed_261_axes_0, epsilon = var_6453_to_fp16, x = input_273)[name = string("normed_261_cast_fp16")];
+            tensor<int32, [2]> var_6463_split_sizes_0 = const()[name = string("op_6463_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6463_axis_0 = const()[name = string("op_6463_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_6463_0, tensor<fp16, [3, 2, 256]> var_6463_1 = split(axis = var_6463_axis_0, split_sizes = var_6463_split_sizes_0, x = normed_261_cast_fp16)[name = string("op_6463")];
+            tensor<fp16, [3, 2, 256]> k_61 = mul(x = var_6463_0, y = layers_9_self_attn_k_norm_weight)[name = string("k_61")];
+            tensor<int32, [4]> var_6470 = const()[name = string("op_6470"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_6471 = reshape(shape = var_6470, x = k_61)[name = string("op_6471")];
+            tensor<int32, [4]> var_6476 = const()[name = string("op_6476"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_6478_promoted = const()[name = string("op_6478_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_6433 = transpose(perm = var_6432, x = var_6427)[name = string("transpose_55")];
+            tensor<fp16, [1, 2, 3, 256]> var_6479 = pow(x = var_6433, y = var_6478_promoted)[name = string("op_6479")];
+            tensor<int32, [1]> var_6484_axes_0 = const()[name = string("op_6484_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6484_keep_dims_0 = const()[name = string("op_6484_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_6484 = reduce_mean(axes = var_6484_axes_0, keep_dims = var_6484_keep_dims_0, x = var_6479)[name = string("op_6484")];
+            fp16 var_6486_to_fp16 = const()[name = string("op_6486_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_19_cast_fp16 = add(x = var_6484, y = var_6486_to_fp16)[name = string("mean_sq_19_cast_fp16")];
+            fp32 var_6488_epsilon_0 = const()[name = string("op_6488_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_6488_cast_fp16 = rsqrt(epsilon = var_6488_epsilon_0, x = mean_sq_19_cast_fp16)[name = string("op_6488_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_277_cast_fp16 = mul(x = var_6433, y = var_6488_cast_fp16)[name = string("input_277_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_117 = transpose(perm = var_6476, x = var_6471)[name = string("transpose_54")];
+            tensor<fp16, [1, 2, 3, 256]> var_6490_cast_fp16 = mul(x = q_117, y = cos_s)[name = string("op_6490_cast_fp16")];
+            tensor<int32, [2]> var_6491_split_sizes_0 = const()[name = string("op_6491_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6491_axis_0 = const()[name = string("op_6491_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_6491_0, tensor<fp16, [1, 2, 3, 128]> var_6491_1 = split(axis = var_6491_axis_0, split_sizes = var_6491_split_sizes_0, x = q_117)[name = string("op_6491")];
+            fp16 const_110_promoted = const()[name = string("const_110_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_6493 = mul(x = var_6491_1, y = const_110_promoted)[name = string("op_6493")];
+            int32 var_6495 = const()[name = string("op_6495"), val = int32(-1)];
+            bool var_6496_interleave_0 = const()[name = string("op_6496_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_6496 = concat(axis = var_6495, interleave = var_6496_interleave_0, values = (var_6493, var_6491_0))[name = string("op_6496")];
+            tensor<fp16, [1, 2, 3, 256]> var_6497_cast_fp16 = mul(x = var_6496, y = sin_s)[name = string("op_6497_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_275_cast_fp16 = add(x = var_6490_cast_fp16, y = var_6497_cast_fp16)[name = string("input_275_cast_fp16")];
+            tensor<int32, [8]> k_padded_17_pad_0 = const()[name = string("k_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_17_mode_0 = const()[name = string("k_padded_17_mode_0"), val = string("constant")];
+            fp16 const_111_to_fp16 = const()[name = string("const_111_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_17_cast_fp16 = pad(constant_val = const_111_to_fp16, mode = k_padded_17_mode_0, pad = k_padded_17_pad_0, x = input_275_cast_fp16)[name = string("k_padded_17_cast_fp16")];
+            tensor<int32, [8]> v_padded_17_pad_0 = const()[name = string("v_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_17_mode_0 = const()[name = string("v_padded_17_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_17_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = v_padded_17_mode_0, pad = v_padded_17_pad_0, x = input_277_cast_fp16)[name = string("v_padded_17_cast_fp16")];
+            tensor<int32, [4]> slot_k_19_begin_0 = const()[name = string("slot_k_19_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_19_end_0 = const()[name = string("slot_k_19_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_19_end_mask_0 = const()[name = string("slot_k_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_19_cast_fp16 = slice_by_index(begin = slot_k_19_begin_0, end = slot_k_19_end_0, end_mask = slot_k_19_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("slot_k_19_cast_fp16")];
+            tensor<int32, [4]> slot_v_19_begin_0 = const()[name = string("slot_v_19_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_19_end_0 = const()[name = string("slot_v_19_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_19_end_mask_0 = const()[name = string("slot_v_19_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_19_cast_fp16 = slice_by_index(begin = slot_v_19_begin_0, end = slot_v_19_end_0, end_mask = slot_v_19_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("slot_v_19_cast_fp16")];
+            tensor<int32, [4]> var_6536_begin_0 = const()[name = string("op_6536_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_6536_end_0 = const()[name = string("op_6536_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6536_end_mask_0 = const()[name = string("op_6536_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_6536_cast_fp16 = slice_by_index(begin = var_6536_begin_0, end = var_6536_end_0, end_mask = var_6536_end_mask_0, x = slot_k_19_cast_fp16)[name = string("op_6536_cast_fp16")];
+            int32 var_6543 = const()[name = string("op_6543"), val = int32(2)];
+            bool new_k_19_interleave_0 = const()[name = string("new_k_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_19_cast_fp16 = concat(axis = var_6543, interleave = new_k_19_interleave_0, values = (var_6536_cast_fp16, k_padded_17_cast_fp16))[name = string("new_k_19_cast_fp16")];
+            tensor<int32, [4]> var_6559_begin_0 = const()[name = string("op_6559_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_6559_end_0 = const()[name = string("op_6559_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6559_end_mask_0 = const()[name = string("op_6559_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_6559_cast_fp16 = slice_by_index(begin = var_6559_begin_0, end = var_6559_end_0, end_mask = var_6559_end_mask_0, x = slot_v_19_cast_fp16)[name = string("op_6559_cast_fp16")];
+            int32 var_6566 = const()[name = string("op_6566"), val = int32(2)];
+            bool new_v_19_interleave_0 = const()[name = string("new_v_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_19_cast_fp16 = concat(axis = var_6566, interleave = new_v_19_interleave_0, values = (var_6559_cast_fp16, v_padded_17_cast_fp16))[name = string("new_v_19_cast_fp16")];
+            tensor<int32, [4]> var_6572_begin_0 = const()[name = string("op_6572_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6572_end_0 = const()[name = string("op_6572_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_6572_end_mask_0 = const()[name = string("op_6572_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_6572_cast_fp16 = slice_by_index(begin = var_6572_begin_0, end = var_6572_end_0, end_mask = var_6572_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("op_6572_cast_fp16")];
+            tensor<int32, [4]> var_6577_begin_0 = const()[name = string("op_6577_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6577_end_0 = const()[name = string("op_6577_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6577_end_mask_0 = const()[name = string("op_6577_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6577_cast_fp16 = slice_by_index(begin = var_6577_begin_0, end = var_6577_end_0, end_mask = var_6577_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("op_6577_cast_fp16")];
+            int32 var_6579 = const()[name = string("op_6579"), val = int32(0)];
+            bool K_sliding_out_17_interleave_0 = const()[name = string("K_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out_17_cast_fp16 = concat(axis = var_6579, interleave = K_sliding_out_17_interleave_0, values = (var_6572_cast_fp16, new_k_19_cast_fp16, var_6577_cast_fp16))[name = string("K_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_6585_begin_0 = const()[name = string("op_6585_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_6585_end_0 = const()[name = string("op_6585_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_6585_end_mask_0 = const()[name = string("op_6585_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [8, 2, 512, 512]> var_6585_cast_fp16 = slice_by_index(begin = var_6585_begin_0, end = var_6585_end_0, end_mask = var_6585_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("op_6585_cast_fp16")];
+            tensor<int32, [4]> var_6590_begin_0 = const()[name = string("op_6590_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6590_end_0 = const()[name = string("op_6590_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6590_end_mask_0 = const()[name = string("op_6590_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6590_cast_fp16 = slice_by_index(begin = var_6590_begin_0, end = var_6590_end_0, end_mask = var_6590_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("op_6590_cast_fp16")];
+            int32 var_6592 = const()[name = string("op_6592"), val = int32(0)];
+            bool V_sliding_out_17_interleave_0 = const()[name = string("V_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out_17_cast_fp16 = concat(axis = var_6592, interleave = V_sliding_out_17_interleave_0, values = (var_6585_cast_fp16, new_v_19_cast_fp16, var_6590_cast_fp16))[name = string("V_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_6598_begin_0 = const()[name = string("op_6598_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6598_end_0 = const()[name = string("op_6598_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_6598_end_mask_0 = const()[name = string("op_6598_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6598_cast_fp16 = slice_by_index(begin = var_6598_begin_0, end = var_6598_end_0, end_mask = var_6598_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("op_6598_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_19_begin_0 = const()[name = string("K_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_19_end_0 = const()[name = string("K_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_19_end_mask_0 = const()[name = string("K_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_19_cast_fp16 = slice_by_index(begin = K_for_attn_19_begin_0, end = K_for_attn_19_end_0, end_mask = K_for_attn_19_end_mask_0, x = var_6598_cast_fp16)[name = string("K_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> var_6608_begin_0 = const()[name = string("op_6608_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6608_end_0 = const()[name = string("op_6608_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_6608_end_mask_0 = const()[name = string("op_6608_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_6608_cast_fp16 = slice_by_index(begin = var_6608_begin_0, end = var_6608_end_0, end_mask = var_6608_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("op_6608_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_19_begin_0 = const()[name = string("V_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_19_end_0 = const()[name = string("V_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_19_end_mask_0 = const()[name = string("V_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_19_cast_fp16 = slice_by_index(begin = V_for_attn_19_begin_0, end = V_for_attn_19_end_0, end_mask = V_for_attn_19_end_mask_0, x = var_6608_cast_fp16)[name = string("V_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_18_reps_0 = const()[name = string("tile_18_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = K_for_attn_19_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [8, 1, 512, 256]> tile_18_cast_fp16 = tile(reps = tile_18_reps_0, x = transpose_36_cast_fp16)[name = string("tile_18_cast_fp16")];
+            tensor<int32, [5]> concat_38 = const()[name = string("concat_38"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_36_cast_fp16 = reshape(shape = concat_38, x = tile_18_cast_fp16)[name = string("reshape_36_cast_fp16")];
+            tensor<int32, [5]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_39 = const()[name = string("concat_39"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_36_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_37_cast_fp16 = reshape(shape = concat_39, x = transpose_37_cast_fp16)[name = string("reshape_37_cast_fp16")];
+            tensor<int32, [4]> transpose_77_perm_0 = const()[name = string("transpose_77_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_19_reps_0 = const()[name = string("tile_19_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = V_for_attn_19_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [8, 1, 512, 256]> tile_19_cast_fp16 = tile(reps = tile_19_reps_0, x = transpose_38_cast_fp16)[name = string("tile_19_cast_fp16")];
+            tensor<int32, [5]> concat_40 = const()[name = string("concat_40"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_38_cast_fp16 = reshape(shape = concat_40, x = tile_19_cast_fp16)[name = string("reshape_38_cast_fp16")];
+            tensor<int32, [5]> transpose_39_perm_0 = const()[name = string("transpose_39_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_41 = const()[name = string("concat_41"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_39_cast_fp16 = transpose(perm = transpose_39_perm_0, x = reshape_38_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_39_cast_fp16 = reshape(shape = concat_41, x = transpose_39_cast_fp16)[name = string("reshape_39_cast_fp16")];
+            tensor<int32, [4]> V_expanded_19_perm_0 = const()[name = string("V_expanded_19_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_37_transpose_x_0 = const()[name = string("attn_weights_37_transpose_x_0"), val = bool(false)];
+            bool attn_weights_37_transpose_y_0 = const()[name = string("attn_weights_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_77_cast_fp16 = transpose(perm = transpose_77_perm_0, x = reshape_37_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_37_cast_fp16 = matmul(transpose_x = attn_weights_37_transpose_x_0, transpose_y = attn_weights_37_transpose_y_0, x = q_119_cast_fp16, y = transpose_77_cast_fp16)[name = string("attn_weights_37_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_187_cast_fp16 = add(x = attn_weights_37_cast_fp16, y = causal_mask_sliding)[name = string("x_187_cast_fp16")];
+            tensor<int32, [1]> reduce_max_9_axes_0 = const()[name = string("reduce_max_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_9_keep_dims_0 = const()[name = string("reduce_max_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_9 = reduce_max(axes = reduce_max_9_axes_0, keep_dims = reduce_max_9_keep_dims_0, x = x_187_cast_fp16)[name = string("reduce_max_9")];
+            tensor<fp16, [1, 8, 3, 512]> var_6643 = sub(x = x_187_cast_fp16, y = reduce_max_9)[name = string("op_6643")];
+            tensor<fp16, [1, 8, 3, 512]> var_6649 = exp(x = var_6643)[name = string("op_6649")];
+            tensor<int32, [1]> var_6659_axes_0 = const()[name = string("op_6659_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6659_keep_dims_0 = const()[name = string("op_6659_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_6659 = reduce_sum(axes = var_6659_axes_0, keep_dims = var_6659_keep_dims_0, x = var_6649)[name = string("op_6659")];
+            tensor<fp16, [1, 8, 3, 512]> var_6665_cast_fp16 = real_div(x = var_6649, y = var_6659)[name = string("op_6665_cast_fp16")];
+            bool attn_output_55_transpose_x_0 = const()[name = string("attn_output_55_transpose_x_0"), val = bool(false)];
+            bool attn_output_55_transpose_y_0 = const()[name = string("attn_output_55_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_19_cast_fp16 = transpose(perm = V_expanded_19_perm_0, x = reshape_39_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_55_cast_fp16 = matmul(transpose_x = attn_output_55_transpose_x_0, transpose_y = attn_output_55_transpose_y_0, x = var_6665_cast_fp16, y = V_expanded_19_cast_fp16)[name = string("attn_output_55_cast_fp16")];
+            tensor<int32, [4]> var_6676 = const()[name = string("op_6676"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6683 = const()[name = string("op_6683"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_6677_cast_fp16 = transpose(perm = var_6676, x = attn_output_55_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 3, 2048]> attn_output_57_cast_fp16 = reshape(shape = var_6683, x = var_6677_cast_fp16)[name = string("attn_output_57_cast_fp16")];
+            tensor<int32, [3]> var_6688 = const()[name = string("op_6688"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6704_pad_type_0 = const()[name = string("op_6704_pad_type_0"), val = string("valid")];
+            int32 var_6704_groups_0 = const()[name = string("op_6704_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6704_strides_0 = const()[name = string("op_6704_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6704_pad_0 = const()[name = string("op_6704_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6704_dilations_0 = const()[name = string("op_6704_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_9_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560655040))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563276544))))[name = string("squeeze_9_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_6689_cast_fp16 = transpose(perm = var_6688, x = attn_output_57_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 2560, 3]> var_6704_cast_fp16 = conv(dilations = var_6704_dilations_0, groups = var_6704_groups_0, pad = var_6704_pad_0, pad_type = var_6704_pad_type_0, strides = var_6704_strides_0, weight = squeeze_9_cast_fp16_to_fp32_to_fp16_palettized, x = var_6689_cast_fp16)[name = string("op_6704_cast_fp16")];
+            tensor<int32, [3]> var_6708 = const()[name = string("op_6708"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6714 = const()[name = string("op_6714"), val = int32(-1)];
+            fp16 const_113_promoted_to_fp16 = const()[name = string("const_113_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_191_cast_fp16 = transpose(perm = var_6708, x = var_6704_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 3, 2560]> var_6716_cast_fp16 = mul(x = x_191_cast_fp16, y = const_113_promoted_to_fp16)[name = string("op_6716_cast_fp16")];
+            bool input_281_interleave_0 = const()[name = string("input_281_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_281_cast_fp16 = concat(axis = var_6714, interleave = input_281_interleave_0, values = (x_191_cast_fp16, var_6716_cast_fp16))[name = string("input_281_cast_fp16")];
+            tensor<int32, [1]> normed_265_axes_0 = const()[name = string("normed_265_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6711_to_fp16 = const()[name = string("op_6711_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_265_cast_fp16 = layer_norm(axes = normed_265_axes_0, epsilon = var_6711_to_fp16, x = input_281_cast_fp16)[name = string("normed_265_cast_fp16")];
+            tensor<int32, [2]> var_6721_split_sizes_0 = const()[name = string("op_6721_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6721_axis_0 = const()[name = string("op_6721_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6721_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6721_cast_fp16_1 = split(axis = var_6721_axis_0, split_sizes = var_6721_split_sizes_0, x = normed_265_cast_fp16)[name = string("op_6721_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563279168)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_59_cast_fp16 = mul(x = var_6721_cast_fp16_0, y = layers_9_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_59_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_193_cast_fp16 = add(x = x_179_cast_fp16, y = attn_output_59_cast_fp16)[name = string("x_193_cast_fp16")];
+            int32 var_6730 = const()[name = string("op_6730"), val = int32(-1)];
+            fp16 const_114_promoted_to_fp16 = const()[name = string("const_114_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6732_cast_fp16 = mul(x = x_193_cast_fp16, y = const_114_promoted_to_fp16)[name = string("op_6732_cast_fp16")];
+            bool input_283_interleave_0 = const()[name = string("input_283_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_283_cast_fp16 = concat(axis = var_6730, interleave = input_283_interleave_0, values = (x_193_cast_fp16, var_6732_cast_fp16))[name = string("input_283_cast_fp16")];
+            tensor<int32, [1]> normed_269_axes_0 = const()[name = string("normed_269_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6727_to_fp16 = const()[name = string("op_6727_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_269_cast_fp16 = layer_norm(axes = normed_269_axes_0, epsilon = var_6727_to_fp16, x = input_283_cast_fp16)[name = string("normed_269_cast_fp16")];
+            tensor<int32, [2]> var_6737_split_sizes_0 = const()[name = string("op_6737_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6737_axis_0 = const()[name = string("op_6737_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6737_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6737_cast_fp16_1 = split(axis = var_6737_axis_0, split_sizes = var_6737_split_sizes_0, x = normed_269_cast_fp16)[name = string("op_6737_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563284352)))];
+            tensor<fp16, [1, 3, 2560]> h_57_cast_fp16 = mul(x = var_6737_cast_fp16_0, y = layers_9_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_57_cast_fp16")];
+            tensor<int32, [3]> var_6748 = const()[name = string("op_6748"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_285_axes_0 = const()[name = string("input_285_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6749 = transpose(perm = var_6748, x = h_57_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 2560, 1, 3]> input_285 = expand_dims(axes = input_285_axes_0, x = var_6749)[name = string("input_285")];
+            string gate_37_pad_type_0 = const()[name = string("gate_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_37_strides_0 = const()[name = string("gate_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_37_pad_0 = const()[name = string("gate_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_37_dilations_0 = const()[name = string("gate_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_37_groups_0 = const()[name = string("gate_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_37 = conv(dilations = gate_37_dilations_0, groups = gate_37_groups_0, pad = gate_37_pad_0, pad_type = gate_37_pad_type_0, strides = gate_37_strides_0, weight = layers_9_mlp_gate_proj_weight_palettized, x = input_285)[name = string("gate_37")];
+            string up_19_pad_type_0 = const()[name = string("up_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_19_strides_0 = const()[name = string("up_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_19_pad_0 = const()[name = string("up_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_19_dilations_0 = const()[name = string("up_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_19_groups_0 = const()[name = string("up_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_19 = conv(dilations = up_19_dilations_0, groups = up_19_groups_0, pad = up_19_pad_0, pad_type = up_19_pad_type_0, strides = up_19_strides_0, weight = layers_9_mlp_up_proj_weight_palettized, x = input_285)[name = string("up_19")];
+            string gate_39_mode_0 = const()[name = string("gate_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_39 = gelu(mode = gate_39_mode_0, x = gate_37)[name = string("gate_39")];
+            tensor<fp16, [1, 10240, 1, 3]> input_287 = mul(x = gate_39, y = up_19)[name = string("input_287")];
+            string mlp_out_19_pad_type_0 = const()[name = string("mlp_out_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_19_strides_0 = const()[name = string("mlp_out_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_19_pad_0 = const()[name = string("mlp_out_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_19_dilations_0 = const()[name = string("mlp_out_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_19_groups_0 = const()[name = string("mlp_out_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_19 = conv(dilations = mlp_out_19_dilations_0, groups = mlp_out_19_groups_0, pad = mlp_out_19_pad_0, pad_type = mlp_out_19_pad_type_0, strides = mlp_out_19_strides_0, weight = layers_9_mlp_down_proj_weight_palettized, x = input_287)[name = string("mlp_out_19")];
+            tensor<int32, [1]> var_6789_axes_0 = const()[name = string("op_6789_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6789 = squeeze(axes = var_6789_axes_0, x = mlp_out_19)[name = string("op_6789")];
+            tensor<int32, [3]> var_6793 = const()[name = string("op_6793"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6799 = const()[name = string("op_6799"), val = int32(-1)];
+            fp16 const_115_promoted = const()[name = string("const_115_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_195 = transpose(perm = var_6793, x = var_6789)[name = string("transpose_43")];
+            tensor<fp16, [1, 3, 2560]> var_6801 = mul(x = x_195, y = const_115_promoted)[name = string("op_6801")];
+            bool input_289_interleave_0 = const()[name = string("input_289_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_289 = concat(axis = var_6799, interleave = input_289_interleave_0, values = (x_195, var_6801))[name = string("input_289")];
+            tensor<int32, [1]> normed_273_axes_0 = const()[name = string("normed_273_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6796_to_fp16 = const()[name = string("op_6796_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_273_cast_fp16 = layer_norm(axes = normed_273_axes_0, epsilon = var_6796_to_fp16, x = input_289)[name = string("normed_273_cast_fp16")];
+            tensor<int32, [2]> var_6806_split_sizes_0 = const()[name = string("op_6806_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6806_axis_0 = const()[name = string("op_6806_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6806_0, tensor<fp16, [1, 3, 2560]> var_6806_1 = split(axis = var_6806_axis_0, split_sizes = var_6806_split_sizes_0, x = normed_273_cast_fp16)[name = string("op_6806")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_93 = mul(x = var_6806_0, y = layers_9_post_feedforward_layernorm_weight)[name = string("hidden_states_93")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_95_cast_fp16 = add(x = x_193_cast_fp16, y = hidden_states_93)[name = string("hidden_states_95_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_19_begin_0 = const()[name = string("per_layer_slice_19_begin_0"), val = tensor<int32, [3]>([0, 0, 5376])];
+            tensor<int32, [3]> per_layer_slice_19_end_0 = const()[name = string("per_layer_slice_19_end_0"), val = tensor<int32, [3]>([1, 3, 5632])];
+            tensor<bool, [3]> per_layer_slice_19_end_mask_0 = const()[name = string("per_layer_slice_19_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_19_cast_fp16 = slice_by_index(begin = per_layer_slice_19_begin_0, end = per_layer_slice_19_end_0, end_mask = per_layer_slice_19_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_19_cast_fp16")];
+            tensor<int32, [3]> var_6834 = const()[name = string("op_6834"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_291_axes_0 = const()[name = string("input_291_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6835 = transpose(perm = var_6834, x = hidden_states_95_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 2560, 1, 3]> input_291 = expand_dims(axes = input_291_axes_0, x = var_6835)[name = string("input_291")];
+            string gated_55_pad_type_0 = const()[name = string("gated_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_55_strides_0 = const()[name = string("gated_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_55_pad_0 = const()[name = string("gated_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_55_dilations_0 = const()[name = string("gated_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_55_groups_0 = const()[name = string("gated_55_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_55 = conv(dilations = gated_55_dilations_0, groups = gated_55_groups_0, pad = gated_55_pad_0, pad_type = gated_55_pad_type_0, strides = gated_55_strides_0, weight = layers_9_per_layer_input_gate_weight_palettized, x = input_291)[name = string("gated_55")];
+            string gated_57_mode_0 = const()[name = string("gated_57_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_57 = gelu(mode = gated_57_mode_0, x = gated_55)[name = string("gated_57")];
+            tensor<int32, [3]> var_6854 = const()[name = string("op_6854"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_19_axes_0 = const()[name = string("per_layer_slice_conv_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_6855_cast_fp16 = transpose(perm = var_6854, x = per_layer_slice_19_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_19_cast_fp16 = expand_dims(axes = per_layer_slice_conv_19_axes_0, x = var_6855_cast_fp16)[name = string("per_layer_slice_conv_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_293_cast_fp16 = mul(x = gated_57, y = per_layer_slice_conv_19_cast_fp16)[name = string("input_293_cast_fp16")];
+            string gated_59_pad_type_0 = const()[name = string("gated_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_59_strides_0 = const()[name = string("gated_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_59_pad_0 = const()[name = string("gated_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_59_dilations_0 = const()[name = string("gated_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_59_groups_0 = const()[name = string("gated_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_9_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563289536))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563617280))))[name = string("layers_9_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_59_cast_fp16 = conv(dilations = gated_59_dilations_0, groups = gated_59_groups_0, pad = gated_59_pad_0, pad_type = gated_59_pad_type_0, strides = gated_59_strides_0, weight = layers_9_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_293_cast_fp16)[name = string("gated_59_cast_fp16")];
+            tensor<int32, [1]> var_6871_axes_0 = const()[name = string("op_6871_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6871_cast_fp16 = squeeze(axes = var_6871_axes_0, x = gated_59_cast_fp16)[name = string("op_6871_cast_fp16")];
+            tensor<int32, [3]> var_6875 = const()[name = string("op_6875"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6881 = const()[name = string("op_6881"), val = int32(-1)];
+            fp16 const_116_promoted_to_fp16 = const()[name = string("const_116_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_197_cast_fp16 = transpose(perm = var_6875, x = var_6871_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 3, 2560]> var_6883_cast_fp16 = mul(x = x_197_cast_fp16, y = const_116_promoted_to_fp16)[name = string("op_6883_cast_fp16")];
+            bool input_295_interleave_0 = const()[name = string("input_295_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_295_cast_fp16 = concat(axis = var_6881, interleave = input_295_interleave_0, values = (x_197_cast_fp16, var_6883_cast_fp16))[name = string("input_295_cast_fp16")];
+            tensor<int32, [1]> normed_277_axes_0 = const()[name = string("normed_277_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6878_to_fp16 = const()[name = string("op_6878_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_277_cast_fp16 = layer_norm(axes = normed_277_axes_0, epsilon = var_6878_to_fp16, x = input_295_cast_fp16)[name = string("normed_277_cast_fp16")];
+            tensor<int32, [2]> var_6888_split_sizes_0 = const()[name = string("op_6888_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6888_axis_0 = const()[name = string("op_6888_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6888_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6888_cast_fp16_1 = split(axis = var_6888_axis_0, split_sizes = var_6888_split_sizes_0, x = normed_277_cast_fp16)[name = string("op_6888_cast_fp16")];
+            tensor<fp16, [2560]> layers_9_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_9_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563619904)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_99_cast_fp16 = mul(x = var_6888_cast_fp16_0, y = layers_9_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_101_cast_fp16 = add(x = hidden_states_95_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1]> const_117_promoted_to_fp16 = const()[name = string("const_117_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.d8p-2])];
+            tensor<fp16, [1, 3, 2560]> x_199_cast_fp16 = mul(x = hidden_states_101_cast_fp16, y = const_117_promoted_to_fp16)[name = string("x_199_cast_fp16")];
+            int32 var_6903 = const()[name = string("op_6903"), val = int32(-1)];
+            fp16 const_118_promoted_to_fp16 = const()[name = string("const_118_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_6905_cast_fp16 = mul(x = x_199_cast_fp16, y = const_118_promoted_to_fp16)[name = string("op_6905_cast_fp16")];
+            bool input_297_interleave_0 = const()[name = string("input_297_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_297_cast_fp16 = concat(axis = var_6903, interleave = input_297_interleave_0, values = (x_199_cast_fp16, var_6905_cast_fp16))[name = string("input_297_cast_fp16")];
+            tensor<int32, [1]> normed_281_axes_0 = const()[name = string("normed_281_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6900_to_fp16 = const()[name = string("op_6900_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_281_cast_fp16 = layer_norm(axes = normed_281_axes_0, epsilon = var_6900_to_fp16, x = input_297_cast_fp16)[name = string("normed_281_cast_fp16")];
+            tensor<int32, [2]> var_6910_split_sizes_0 = const()[name = string("op_6910_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6910_axis_0 = const()[name = string("op_6910_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_6910_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_6910_cast_fp16_1 = split(axis = var_6910_axis_0, split_sizes = var_6910_split_sizes_0, x = normed_281_cast_fp16)[name = string("op_6910_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563625088)))];
+            tensor<fp16, [1, 3, 2560]> h_61_cast_fp16 = mul(x = var_6910_cast_fp16_0, y = layers_10_input_layernorm_weight_promoted_to_fp16)[name = string("h_61_cast_fp16")];
+            tensor<int32, [3]> var_6916 = const()[name = string("op_6916"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6919_axes_0 = const()[name = string("op_6919_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_6917_cast_fp16 = transpose(perm = var_6916, x = h_61_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 2560, 1, 3]> var_6919_cast_fp16 = expand_dims(axes = var_6919_axes_0, x = var_6917_cast_fp16)[name = string("op_6919_cast_fp16")];
+            string q_121_pad_type_0 = const()[name = string("q_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_121_strides_0 = const()[name = string("q_121_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_121_pad_0 = const()[name = string("q_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_121_dilations_0 = const()[name = string("q_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_121_groups_0 = const()[name = string("q_121_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_121 = conv(dilations = q_121_dilations_0, groups = q_121_groups_0, pad = q_121_pad_0, pad_type = q_121_pad_type_0, strides = q_121_strides_0, weight = layers_10_self_attn_q_proj_weight_palettized, x = var_6919_cast_fp16)[name = string("q_121")];
+            tensor<int32, [4]> var_6940 = const()[name = string("op_6940"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_6941 = reshape(shape = var_6940, x = q_121)[name = string("op_6941")];
+            tensor<int32, [4]> transpose_78_perm_0 = const()[name = string("transpose_78_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_6964 = const()[name = string("op_6964"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_78 = transpose(perm = transpose_78_perm_0, x = var_6941)[name = string("transpose_38")];
+            tensor<fp16, [3, 8, 256]> x_201 = reshape(shape = var_6964, x = transpose_78)[name = string("x_201")];
+            int32 var_6970 = const()[name = string("op_6970"), val = int32(-1)];
+            fp16 const_119_promoted = const()[name = string("const_119_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_6972 = mul(x = x_201, y = const_119_promoted)[name = string("op_6972")];
+            bool input_301_interleave_0 = const()[name = string("input_301_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_301 = concat(axis = var_6970, interleave = input_301_interleave_0, values = (x_201, var_6972))[name = string("input_301")];
+            tensor<int32, [1]> normed_285_axes_0 = const()[name = string("normed_285_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6967_to_fp16 = const()[name = string("op_6967_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_285_cast_fp16 = layer_norm(axes = normed_285_axes_0, epsilon = var_6967_to_fp16, x = input_301)[name = string("normed_285_cast_fp16")];
+            tensor<int32, [2]> var_6977_split_sizes_0 = const()[name = string("op_6977_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6977_axis_0 = const()[name = string("op_6977_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_6977_0, tensor<fp16, [3, 8, 256]> var_6977_1 = split(axis = var_6977_axis_0, split_sizes = var_6977_split_sizes_0, x = normed_285_cast_fp16)[name = string("op_6977")];
+            tensor<fp16, [3, 8, 256]> q_125 = mul(x = var_6977_0, y = layers_10_self_attn_q_norm_weight)[name = string("q_125")];
+            tensor<int32, [4]> var_6984 = const()[name = string("op_6984"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_6985 = reshape(shape = var_6984, x = q_125)[name = string("op_6985")];
+            tensor<int32, [4]> var_6990 = const()[name = string("op_6990"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_127 = transpose(perm = var_6990, x = var_6985)[name = string("transpose_37")];
+            tensor<fp16, [1, 8, 3, 256]> var_6992_cast_fp16 = mul(x = q_127, y = cos_s)[name = string("op_6992_cast_fp16")];
+            tensor<int32, [2]> var_6993_split_sizes_0 = const()[name = string("op_6993_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6993_axis_0 = const()[name = string("op_6993_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_6993_0, tensor<fp16, [1, 8, 3, 128]> var_6993_1 = split(axis = var_6993_axis_0, split_sizes = var_6993_split_sizes_0, x = q_127)[name = string("op_6993")];
+            fp16 const_120_promoted = const()[name = string("const_120_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_6995 = mul(x = var_6993_1, y = const_120_promoted)[name = string("op_6995")];
+            int32 var_6997 = const()[name = string("op_6997"), val = int32(-1)];
+            bool var_6998_interleave_0 = const()[name = string("op_6998_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_6998 = concat(axis = var_6997, interleave = var_6998_interleave_0, values = (var_6995, var_6993_0))[name = string("op_6998")];
+            tensor<fp16, [1, 8, 3, 256]> var_6999_cast_fp16 = mul(x = var_6998, y = sin_s)[name = string("op_6999_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_131_cast_fp16 = add(x = var_6992_cast_fp16, y = var_6999_cast_fp16)[name = string("q_131_cast_fp16")];
+            string k_63_pad_type_0 = const()[name = string("k_63_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_63_strides_0 = const()[name = string("k_63_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_63_pad_0 = const()[name = string("k_63_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_63_dilations_0 = const()[name = string("k_63_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_63_groups_0 = const()[name = string("k_63_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> k_63 = conv(dilations = k_63_dilations_0, groups = k_63_groups_0, pad = k_63_pad_0, pad_type = k_63_pad_type_0, strides = k_63_strides_0, weight = layers_10_self_attn_k_proj_weight_palettized, x = var_6919_cast_fp16)[name = string("k_63")];
+            tensor<int32, [4]> var_7017 = const()[name = string("op_7017"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_7018 = reshape(shape = var_7017, x = k_63)[name = string("op_7018")];
+            tensor<int32, [4]> transpose_79_perm_0 = const()[name = string("transpose_79_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_23_pad_type_0 = const()[name = string("v_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_23_strides_0 = const()[name = string("v_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_23_pad_0 = const()[name = string("v_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_23_dilations_0 = const()[name = string("v_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_23_groups_0 = const()[name = string("v_23_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 3]> v_23 = conv(dilations = v_23_dilations_0, groups = v_23_groups_0, pad = v_23_pad_0, pad_type = v_23_pad_type_0, strides = v_23_strides_0, weight = layers_10_self_attn_v_proj_weight_palettized, x = var_6919_cast_fp16)[name = string("v_23")];
+            tensor<int32, [4]> var_7045 = const()[name = string("op_7045"), val = tensor<int32, [4]>([1, 2, 256, 3])];
+            tensor<fp16, [1, 2, 256, 3]> var_7046 = reshape(shape = var_7045, x = v_23)[name = string("op_7046")];
+            tensor<int32, [4]> var_7051 = const()[name = string("op_7051"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7069 = const()[name = string("op_7069"), val = tensor<int32, [3]>([3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> transpose_79 = transpose(perm = transpose_79_perm_0, x = var_7018)[name = string("transpose_36")];
+            tensor<fp16, [3, 2, 256]> x_203 = reshape(shape = var_7069, x = transpose_79)[name = string("x_203")];
+            int32 var_7075 = const()[name = string("op_7075"), val = int32(-1)];
+            fp16 const_121_promoted = const()[name = string("const_121_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 256]> var_7077 = mul(x = x_203, y = const_121_promoted)[name = string("op_7077")];
+            bool input_303_interleave_0 = const()[name = string("input_303_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 512]> input_303 = concat(axis = var_7075, interleave = input_303_interleave_0, values = (x_203, var_7077))[name = string("input_303")];
+            tensor<int32, [1]> normed_289_axes_0 = const()[name = string("normed_289_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7072_to_fp16 = const()[name = string("op_7072_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 512]> normed_289_cast_fp16 = layer_norm(axes = normed_289_axes_0, epsilon = var_7072_to_fp16, x = input_303)[name = string("normed_289_cast_fp16")];
+            tensor<int32, [2]> var_7082_split_sizes_0 = const()[name = string("op_7082_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7082_axis_0 = const()[name = string("op_7082_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 256]> var_7082_0, tensor<fp16, [3, 2, 256]> var_7082_1 = split(axis = var_7082_axis_0, split_sizes = var_7082_split_sizes_0, x = normed_289_cast_fp16)[name = string("op_7082")];
+            tensor<fp16, [3, 2, 256]> k_67 = mul(x = var_7082_0, y = layers_4_self_attn_k_norm_weight)[name = string("k_67")];
+            tensor<int32, [4]> var_7089 = const()[name = string("op_7089"), val = tensor<int32, [4]>([1, 3, 2, 256])];
+            tensor<fp16, [1, 3, 2, 256]> var_7090 = reshape(shape = var_7089, x = k_67)[name = string("op_7090")];
+            tensor<int32, [4]> var_7095 = const()[name = string("op_7095"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_7097_promoted = const()[name = string("op_7097_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 256]> var_7052 = transpose(perm = var_7051, x = var_7046)[name = string("transpose_35")];
+            tensor<fp16, [1, 2, 3, 256]> var_7098 = pow(x = var_7052, y = var_7097_promoted)[name = string("op_7098")];
+            tensor<int32, [1]> var_7103_axes_0 = const()[name = string("op_7103_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7103_keep_dims_0 = const()[name = string("op_7103_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_7103 = reduce_mean(axes = var_7103_axes_0, keep_dims = var_7103_keep_dims_0, x = var_7098)[name = string("op_7103")];
+            fp16 var_7105_to_fp16 = const()[name = string("op_7105_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_21_cast_fp16 = add(x = var_7103, y = var_7105_to_fp16)[name = string("mean_sq_21_cast_fp16")];
+            fp32 var_7107_epsilon_0 = const()[name = string("op_7107_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_7107_cast_fp16 = rsqrt(epsilon = var_7107_epsilon_0, x = mean_sq_21_cast_fp16)[name = string("op_7107_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_307_cast_fp16 = mul(x = var_7052, y = var_7107_cast_fp16)[name = string("input_307_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> q_129 = transpose(perm = var_7095, x = var_7090)[name = string("transpose_34")];
+            tensor<fp16, [1, 2, 3, 256]> var_7109_cast_fp16 = mul(x = q_129, y = cos_s)[name = string("op_7109_cast_fp16")];
+            tensor<int32, [2]> var_7110_split_sizes_0 = const()[name = string("op_7110_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_7110_axis_0 = const()[name = string("op_7110_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 128]> var_7110_0, tensor<fp16, [1, 2, 3, 128]> var_7110_1 = split(axis = var_7110_axis_0, split_sizes = var_7110_split_sizes_0, x = q_129)[name = string("op_7110")];
+            fp16 const_122_promoted = const()[name = string("const_122_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 128]> var_7112 = mul(x = var_7110_1, y = const_122_promoted)[name = string("op_7112")];
+            int32 var_7114 = const()[name = string("op_7114"), val = int32(-1)];
+            bool var_7115_interleave_0 = const()[name = string("op_7115_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 256]> var_7115 = concat(axis = var_7114, interleave = var_7115_interleave_0, values = (var_7112, var_7110_0))[name = string("op_7115")];
+            tensor<fp16, [1, 2, 3, 256]> var_7116_cast_fp16 = mul(x = var_7115, y = sin_s)[name = string("op_7116_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 256]> input_305_cast_fp16 = add(x = var_7109_cast_fp16, y = var_7116_cast_fp16)[name = string("input_305_cast_fp16")];
+            tensor<int32, [8]> k_padded_pad_0 = const()[name = string("k_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_mode_0 = const()[name = string("k_padded_mode_0"), val = string("constant")];
+            fp16 const_123_to_fp16 = const()[name = string("const_123_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> k_padded_cast_fp16 = pad(constant_val = const_123_to_fp16, mode = k_padded_mode_0, pad = k_padded_pad_0, x = input_305_cast_fp16)[name = string("k_padded_cast_fp16")];
+            tensor<int32, [8]> v_padded_pad_0 = const()[name = string("v_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_mode_0 = const()[name = string("v_padded_mode_0"), val = string("constant")];
+            fp16 const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 3, 512]> v_padded_cast_fp16 = pad(constant_val = const_124_to_fp16, mode = v_padded_mode_0, pad = v_padded_pad_0, x = input_307_cast_fp16)[name = string("v_padded_cast_fp16")];
+            tensor<int32, [4]> slot_k_21_begin_0 = const()[name = string("slot_k_21_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_21_end_0 = const()[name = string("slot_k_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_k_21_end_mask_0 = const()[name = string("slot_k_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_k_21_cast_fp16 = slice_by_index(begin = slot_k_21_begin_0, end = slot_k_21_end_0, end_mask = slot_k_21_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("slot_k_21_cast_fp16")];
+            tensor<int32, [4]> slot_v_21_begin_0 = const()[name = string("slot_v_21_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_21_end_0 = const()[name = string("slot_v_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> slot_v_21_end_mask_0 = const()[name = string("slot_v_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> slot_v_21_cast_fp16 = slice_by_index(begin = slot_v_21_begin_0, end = slot_v_21_end_0, end_mask = slot_v_21_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("slot_v_21_cast_fp16")];
+            tensor<int32, [4]> var_7155_begin_0 = const()[name = string("op_7155_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_7155_end_0 = const()[name = string("op_7155_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7155_end_mask_0 = const()[name = string("op_7155_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_7155_cast_fp16 = slice_by_index(begin = var_7155_begin_0, end = var_7155_end_0, end_mask = var_7155_end_mask_0, x = slot_k_21_cast_fp16)[name = string("op_7155_cast_fp16")];
+            int32 var_7162 = const()[name = string("op_7162"), val = int32(2)];
+            bool new_k_21_interleave_0 = const()[name = string("new_k_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_k_21_cast_fp16 = concat(axis = var_7162, interleave = new_k_21_interleave_0, values = (var_7155_cast_fp16, k_padded_cast_fp16))[name = string("new_k_21_cast_fp16")];
+            tensor<int32, [4]> var_7178_begin_0 = const()[name = string("op_7178_begin_0"), val = tensor<int32, [4]>([0, 0, 3, 0])];
+            tensor<int32, [4]> var_7178_end_0 = const()[name = string("op_7178_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7178_end_mask_0 = const()[name = string("op_7178_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 509, 512]> var_7178_cast_fp16 = slice_by_index(begin = var_7178_begin_0, end = var_7178_end_0, end_mask = var_7178_end_mask_0, x = slot_v_21_cast_fp16)[name = string("op_7178_cast_fp16")];
+            int32 var_7185 = const()[name = string("op_7185"), val = int32(2)];
+            bool new_v_21_interleave_0 = const()[name = string("new_v_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> new_v_21_cast_fp16 = concat(axis = var_7185, interleave = new_v_21_interleave_0, values = (var_7178_cast_fp16, v_padded_cast_fp16))[name = string("new_v_21_cast_fp16")];
+            tensor<int32, [4]> var_7191_begin_0 = const()[name = string("op_7191_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7191_end_0 = const()[name = string("op_7191_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_7191_end_mask_0 = const()[name = string("op_7191_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_7191_cast_fp16 = slice_by_index(begin = var_7191_begin_0, end = var_7191_end_0, end_mask = var_7191_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("op_7191_cast_fp16")];
+            int32 var_7198 = const()[name = string("op_7198"), val = int32(0)];
+            bool K_sliding_out_interleave_0 = const()[name = string("K_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out = concat(axis = var_7198, interleave = K_sliding_out_interleave_0, values = (var_7191_cast_fp16, new_k_21_cast_fp16))[name = string("K_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_7204_begin_0 = const()[name = string("op_7204_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_7204_end_0 = const()[name = string("op_7204_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_7204_end_mask_0 = const()[name = string("op_7204_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<fp16, [9, 2, 512, 512]> var_7204_cast_fp16 = slice_by_index(begin = var_7204_begin_0, end = var_7204_end_0, end_mask = var_7204_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("op_7204_cast_fp16")];
+            int32 var_7211 = const()[name = string("op_7211"), val = int32(0)];
+            bool V_sliding_out_interleave_0 = const()[name = string("V_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out = concat(axis = var_7211, interleave = V_sliding_out_interleave_0, values = (var_7204_cast_fp16, new_v_21_cast_fp16))[name = string("V_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_7217_begin_0 = const()[name = string("op_7217_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_7217_end_0 = const()[name = string("op_7217_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7217_end_mask_0 = const()[name = string("op_7217_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_7217_cast_fp16 = slice_by_index(begin = var_7217_begin_0, end = var_7217_end_0, end_mask = var_7217_end_mask_0, x = K_sliding_out)[name = string("op_7217_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_21_begin_0 = const()[name = string("K_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_21_end_0 = const()[name = string("K_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_21_end_mask_0 = const()[name = string("K_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> kv13_k = slice_by_index(begin = K_for_attn_21_begin_0, end = K_for_attn_21_end_0, end_mask = K_for_attn_21_end_mask_0, x = var_7217_cast_fp16)[name = string("K_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> var_7227_begin_0 = const()[name = string("op_7227_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_7227_end_0 = const()[name = string("op_7227_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_7227_end_mask_0 = const()[name = string("op_7227_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 512, 512]> var_7227_cast_fp16 = slice_by_index(begin = var_7227_begin_0, end = var_7227_end_0, end_mask = var_7227_end_mask_0, x = V_sliding_out)[name = string("op_7227_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_21_begin_0 = const()[name = string("V_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_21_end_0 = const()[name = string("V_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_21_end_mask_0 = const()[name = string("V_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> kv13_v = slice_by_index(begin = V_for_attn_21_begin_0, end = V_for_attn_21_end_0, end_mask = V_for_attn_21_end_mask_0, x = var_7227_cast_fp16)[name = string("V_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_20_reps_0 = const()[name = string("tile_20_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_40_cast_fp16 = transpose(perm = transpose_40_perm_0, x = kv13_k)[name = string("transpose_33")];
+            tensor<fp16, [8, 1, 512, 256]> tile_20_cast_fp16 = tile(reps = tile_20_reps_0, x = transpose_40_cast_fp16)[name = string("tile_20_cast_fp16")];
+            tensor<int32, [5]> concat_42 = const()[name = string("concat_42"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_40_cast_fp16 = reshape(shape = concat_42, x = tile_20_cast_fp16)[name = string("reshape_40_cast_fp16")];
+            tensor<int32, [5]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_43 = const()[name = string("concat_43"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_40_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_41_cast_fp16 = reshape(shape = concat_43, x = transpose_41_cast_fp16)[name = string("reshape_41_cast_fp16")];
+            tensor<int32, [4]> transpose_80_perm_0 = const()[name = string("transpose_80_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_21_reps_0 = const()[name = string("tile_21_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_42_cast_fp16 = transpose(perm = transpose_42_perm_0, x = kv13_v)[name = string("transpose_31")];
+            tensor<fp16, [8, 1, 512, 256]> tile_21_cast_fp16 = tile(reps = tile_21_reps_0, x = transpose_42_cast_fp16)[name = string("tile_21_cast_fp16")];
+            tensor<int32, [5]> concat_44 = const()[name = string("concat_44"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_42_cast_fp16 = reshape(shape = concat_44, x = tile_21_cast_fp16)[name = string("reshape_42_cast_fp16")];
+            tensor<int32, [5]> transpose_43_perm_0 = const()[name = string("transpose_43_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_45 = const()[name = string("concat_45"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_43_cast_fp16 = transpose(perm = transpose_43_perm_0, x = reshape_42_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_43_cast_fp16 = reshape(shape = concat_45, x = transpose_43_cast_fp16)[name = string("reshape_43_cast_fp16")];
+            tensor<int32, [4]> V_expanded_21_perm_0 = const()[name = string("V_expanded_21_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_41_transpose_x_0 = const()[name = string("attn_weights_41_transpose_x_0"), val = bool(false)];
+            bool attn_weights_41_transpose_y_0 = const()[name = string("attn_weights_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_80_cast_fp16 = transpose(perm = transpose_80_perm_0, x = reshape_41_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_41_cast_fp16 = matmul(transpose_x = attn_weights_41_transpose_x_0, transpose_y = attn_weights_41_transpose_y_0, x = q_131_cast_fp16, y = transpose_80_cast_fp16)[name = string("attn_weights_41_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_207_cast_fp16 = add(x = attn_weights_41_cast_fp16, y = causal_mask_sliding)[name = string("x_207_cast_fp16")];
+            tensor<int32, [1]> reduce_max_10_axes_0 = const()[name = string("reduce_max_10_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_10_keep_dims_0 = const()[name = string("reduce_max_10_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_10 = reduce_max(axes = reduce_max_10_axes_0, keep_dims = reduce_max_10_keep_dims_0, x = x_207_cast_fp16)[name = string("reduce_max_10")];
+            tensor<fp16, [1, 8, 3, 512]> var_7282 = sub(x = x_207_cast_fp16, y = reduce_max_10)[name = string("op_7282")];
+            tensor<fp16, [1, 8, 3, 512]> var_7288 = exp(x = var_7282)[name = string("op_7288")];
+            tensor<int32, [1]> var_7298_axes_0 = const()[name = string("op_7298_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7298_keep_dims_0 = const()[name = string("op_7298_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_7298 = reduce_sum(axes = var_7298_axes_0, keep_dims = var_7298_keep_dims_0, x = var_7288)[name = string("op_7298")];
+            tensor<fp16, [1, 8, 3, 512]> var_7304_cast_fp16 = real_div(x = var_7288, y = var_7298)[name = string("op_7304_cast_fp16")];
+            bool attn_output_61_transpose_x_0 = const()[name = string("attn_output_61_transpose_x_0"), val = bool(false)];
+            bool attn_output_61_transpose_y_0 = const()[name = string("attn_output_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_21_cast_fp16 = transpose(perm = V_expanded_21_perm_0, x = reshape_43_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_61_cast_fp16 = matmul(transpose_x = attn_output_61_transpose_x_0, transpose_y = attn_output_61_transpose_y_0, x = var_7304_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_61_cast_fp16")];
+            tensor<int32, [4]> var_7315 = const()[name = string("op_7315"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7322 = const()[name = string("op_7322"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_7316_cast_fp16 = transpose(perm = var_7315, x = attn_output_61_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 3, 2048]> attn_output_63_cast_fp16 = reshape(shape = var_7322, x = var_7316_cast_fp16)[name = string("attn_output_63_cast_fp16")];
+            tensor<int32, [3]> var_7327 = const()[name = string("op_7327"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7343_pad_type_0 = const()[name = string("op_7343_pad_type_0"), val = string("valid")];
+            int32 var_7343_groups_0 = const()[name = string("op_7343_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7343_strides_0 = const()[name = string("op_7343_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7343_pad_0 = const()[name = string("op_7343_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7343_dilations_0 = const()[name = string("op_7343_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_10_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(563630272))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566251776))))[name = string("squeeze_10_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_7328_cast_fp16 = transpose(perm = var_7327, x = attn_output_63_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 3]> var_7343_cast_fp16 = conv(dilations = var_7343_dilations_0, groups = var_7343_groups_0, pad = var_7343_pad_0, pad_type = var_7343_pad_type_0, strides = var_7343_strides_0, weight = squeeze_10_cast_fp16_to_fp32_to_fp16_palettized, x = var_7328_cast_fp16)[name = string("op_7343_cast_fp16")];
+            tensor<int32, [3]> var_7347 = const()[name = string("op_7347"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7353 = const()[name = string("op_7353"), val = int32(-1)];
+            fp16 const_125_promoted_to_fp16 = const()[name = string("const_125_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_211_cast_fp16 = transpose(perm = var_7347, x = var_7343_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 3, 2560]> var_7355_cast_fp16 = mul(x = x_211_cast_fp16, y = const_125_promoted_to_fp16)[name = string("op_7355_cast_fp16")];
+            bool input_311_interleave_0 = const()[name = string("input_311_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_311_cast_fp16 = concat(axis = var_7353, interleave = input_311_interleave_0, values = (x_211_cast_fp16, var_7355_cast_fp16))[name = string("input_311_cast_fp16")];
+            tensor<int32, [1]> normed_293_axes_0 = const()[name = string("normed_293_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7350_to_fp16 = const()[name = string("op_7350_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_293_cast_fp16 = layer_norm(axes = normed_293_axes_0, epsilon = var_7350_to_fp16, x = input_311_cast_fp16)[name = string("normed_293_cast_fp16")];
+            tensor<int32, [2]> var_7360_split_sizes_0 = const()[name = string("op_7360_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7360_axis_0 = const()[name = string("op_7360_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7360_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7360_cast_fp16_1 = split(axis = var_7360_axis_0, split_sizes = var_7360_split_sizes_0, x = normed_293_cast_fp16)[name = string("op_7360_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566254400)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_65_cast_fp16 = mul(x = var_7360_cast_fp16_0, y = layers_10_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_65_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_213_cast_fp16 = add(x = x_199_cast_fp16, y = attn_output_65_cast_fp16)[name = string("x_213_cast_fp16")];
+            int32 var_7369 = const()[name = string("op_7369"), val = int32(-1)];
+            fp16 const_126_promoted_to_fp16 = const()[name = string("const_126_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_7371_cast_fp16 = mul(x = x_213_cast_fp16, y = const_126_promoted_to_fp16)[name = string("op_7371_cast_fp16")];
+            bool input_313_interleave_0 = const()[name = string("input_313_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_313_cast_fp16 = concat(axis = var_7369, interleave = input_313_interleave_0, values = (x_213_cast_fp16, var_7371_cast_fp16))[name = string("input_313_cast_fp16")];
+            tensor<int32, [1]> normed_297_axes_0 = const()[name = string("normed_297_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7366_to_fp16 = const()[name = string("op_7366_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_297_cast_fp16 = layer_norm(axes = normed_297_axes_0, epsilon = var_7366_to_fp16, x = input_313_cast_fp16)[name = string("normed_297_cast_fp16")];
+            tensor<int32, [2]> var_7376_split_sizes_0 = const()[name = string("op_7376_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7376_axis_0 = const()[name = string("op_7376_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7376_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7376_cast_fp16_1 = split(axis = var_7376_axis_0, split_sizes = var_7376_split_sizes_0, x = normed_297_cast_fp16)[name = string("op_7376_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566259584)))];
+            tensor<fp16, [1, 3, 2560]> h_63_cast_fp16 = mul(x = var_7376_cast_fp16_0, y = layers_10_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_63_cast_fp16")];
+            tensor<int32, [3]> var_7387 = const()[name = string("op_7387"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_315_axes_0 = const()[name = string("input_315_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7388 = transpose(perm = var_7387, x = h_63_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 3]> input_315 = expand_dims(axes = input_315_axes_0, x = var_7388)[name = string("input_315")];
+            string gate_41_pad_type_0 = const()[name = string("gate_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_41_strides_0 = const()[name = string("gate_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_41_pad_0 = const()[name = string("gate_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_41_dilations_0 = const()[name = string("gate_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_41_groups_0 = const()[name = string("gate_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_41 = conv(dilations = gate_41_dilations_0, groups = gate_41_groups_0, pad = gate_41_pad_0, pad_type = gate_41_pad_type_0, strides = gate_41_strides_0, weight = layers_10_mlp_gate_proj_weight_palettized, x = input_315)[name = string("gate_41")];
+            string up_21_pad_type_0 = const()[name = string("up_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_21_strides_0 = const()[name = string("up_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_21_pad_0 = const()[name = string("up_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_21_dilations_0 = const()[name = string("up_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_21_groups_0 = const()[name = string("up_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_21 = conv(dilations = up_21_dilations_0, groups = up_21_groups_0, pad = up_21_pad_0, pad_type = up_21_pad_type_0, strides = up_21_strides_0, weight = layers_10_mlp_up_proj_weight_palettized, x = input_315)[name = string("up_21")];
+            string gate_43_mode_0 = const()[name = string("gate_43_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_43 = gelu(mode = gate_43_mode_0, x = gate_41)[name = string("gate_43")];
+            tensor<fp16, [1, 10240, 1, 3]> input_317 = mul(x = gate_43, y = up_21)[name = string("input_317")];
+            string mlp_out_21_pad_type_0 = const()[name = string("mlp_out_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_21_strides_0 = const()[name = string("mlp_out_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_21_pad_0 = const()[name = string("mlp_out_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_21_dilations_0 = const()[name = string("mlp_out_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_21_groups_0 = const()[name = string("mlp_out_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_21 = conv(dilations = mlp_out_21_dilations_0, groups = mlp_out_21_groups_0, pad = mlp_out_21_pad_0, pad_type = mlp_out_21_pad_type_0, strides = mlp_out_21_strides_0, weight = layers_10_mlp_down_proj_weight_palettized, x = input_317)[name = string("mlp_out_21")];
+            tensor<int32, [1]> var_7428_axes_0 = const()[name = string("op_7428_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7428 = squeeze(axes = var_7428_axes_0, x = mlp_out_21)[name = string("op_7428")];
+            tensor<int32, [3]> var_7432 = const()[name = string("op_7432"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7438 = const()[name = string("op_7438"), val = int32(-1)];
+            fp16 const_127_promoted = const()[name = string("const_127_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_215 = transpose(perm = var_7432, x = var_7428)[name = string("transpose_23")];
+            tensor<fp16, [1, 3, 2560]> var_7440 = mul(x = x_215, y = const_127_promoted)[name = string("op_7440")];
+            bool input_319_interleave_0 = const()[name = string("input_319_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_319 = concat(axis = var_7438, interleave = input_319_interleave_0, values = (x_215, var_7440))[name = string("input_319")];
+            tensor<int32, [1]> normed_301_axes_0 = const()[name = string("normed_301_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7435_to_fp16 = const()[name = string("op_7435_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_301_cast_fp16 = layer_norm(axes = normed_301_axes_0, epsilon = var_7435_to_fp16, x = input_319)[name = string("normed_301_cast_fp16")];
+            tensor<int32, [2]> var_7445_split_sizes_0 = const()[name = string("op_7445_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7445_axis_0 = const()[name = string("op_7445_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7445_0, tensor<fp16, [1, 3, 2560]> var_7445_1 = split(axis = var_7445_axis_0, split_sizes = var_7445_split_sizes_0, x = normed_301_cast_fp16)[name = string("op_7445")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_103 = mul(x = var_7445_0, y = layers_10_post_feedforward_layernorm_weight)[name = string("hidden_states_103")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_105_cast_fp16 = add(x = x_213_cast_fp16, y = hidden_states_103)[name = string("hidden_states_105_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_21_begin_0 = const()[name = string("per_layer_slice_21_begin_0"), val = tensor<int32, [3]>([0, 0, 5632])];
+            tensor<int32, [3]> per_layer_slice_21_end_0 = const()[name = string("per_layer_slice_21_end_0"), val = tensor<int32, [3]>([1, 3, 5888])];
+            tensor<bool, [3]> per_layer_slice_21_end_mask_0 = const()[name = string("per_layer_slice_21_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_21_cast_fp16 = slice_by_index(begin = per_layer_slice_21_begin_0, end = per_layer_slice_21_end_0, end_mask = per_layer_slice_21_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_21_cast_fp16")];
+            tensor<int32, [3]> var_7473 = const()[name = string("op_7473"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_321_axes_0 = const()[name = string("input_321_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7474 = transpose(perm = var_7473, x = hidden_states_105_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 2560, 1, 3]> input_321 = expand_dims(axes = input_321_axes_0, x = var_7474)[name = string("input_321")];
+            string gated_61_pad_type_0 = const()[name = string("gated_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_61_strides_0 = const()[name = string("gated_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_61_pad_0 = const()[name = string("gated_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_61_dilations_0 = const()[name = string("gated_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_61_groups_0 = const()[name = string("gated_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_61 = conv(dilations = gated_61_dilations_0, groups = gated_61_groups_0, pad = gated_61_pad_0, pad_type = gated_61_pad_type_0, strides = gated_61_strides_0, weight = layers_10_per_layer_input_gate_weight_palettized, x = input_321)[name = string("gated_61")];
+            string gated_63_mode_0 = const()[name = string("gated_63_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_63 = gelu(mode = gated_63_mode_0, x = gated_61)[name = string("gated_63")];
+            tensor<int32, [3]> var_7493 = const()[name = string("op_7493"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_21_axes_0 = const()[name = string("per_layer_slice_conv_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_7494_cast_fp16 = transpose(perm = var_7493, x = per_layer_slice_21_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_21_cast_fp16 = expand_dims(axes = per_layer_slice_conv_21_axes_0, x = var_7494_cast_fp16)[name = string("per_layer_slice_conv_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_323_cast_fp16 = mul(x = gated_63, y = per_layer_slice_conv_21_cast_fp16)[name = string("input_323_cast_fp16")];
+            string gated_65_pad_type_0 = const()[name = string("gated_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_65_strides_0 = const()[name = string("gated_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_65_pad_0 = const()[name = string("gated_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_65_dilations_0 = const()[name = string("gated_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_65_groups_0 = const()[name = string("gated_65_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_10_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566264768))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566592512))))[name = string("layers_10_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_65_cast_fp16 = conv(dilations = gated_65_dilations_0, groups = gated_65_groups_0, pad = gated_65_pad_0, pad_type = gated_65_pad_type_0, strides = gated_65_strides_0, weight = layers_10_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_323_cast_fp16)[name = string("gated_65_cast_fp16")];
+            tensor<int32, [1]> var_7510_axes_0 = const()[name = string("op_7510_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7510_cast_fp16 = squeeze(axes = var_7510_axes_0, x = gated_65_cast_fp16)[name = string("op_7510_cast_fp16")];
+            tensor<int32, [3]> var_7514 = const()[name = string("op_7514"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7520 = const()[name = string("op_7520"), val = int32(-1)];
+            fp16 const_128_promoted_to_fp16 = const()[name = string("const_128_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_217_cast_fp16 = transpose(perm = var_7514, x = var_7510_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 3, 2560]> var_7522_cast_fp16 = mul(x = x_217_cast_fp16, y = const_128_promoted_to_fp16)[name = string("op_7522_cast_fp16")];
+            bool input_325_interleave_0 = const()[name = string("input_325_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_325_cast_fp16 = concat(axis = var_7520, interleave = input_325_interleave_0, values = (x_217_cast_fp16, var_7522_cast_fp16))[name = string("input_325_cast_fp16")];
+            tensor<int32, [1]> normed_305_axes_0 = const()[name = string("normed_305_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7517_to_fp16 = const()[name = string("op_7517_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_305_cast_fp16 = layer_norm(axes = normed_305_axes_0, epsilon = var_7517_to_fp16, x = input_325_cast_fp16)[name = string("normed_305_cast_fp16")];
+            tensor<int32, [2]> var_7527_split_sizes_0 = const()[name = string("op_7527_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7527_axis_0 = const()[name = string("op_7527_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7527_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7527_cast_fp16_1 = split(axis = var_7527_axis_0, split_sizes = var_7527_split_sizes_0, x = normed_305_cast_fp16)[name = string("op_7527_cast_fp16")];
+            tensor<fp16, [2560]> layers_10_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_10_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566595136)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_109_cast_fp16 = mul(x = var_7527_cast_fp16_0, y = layers_10_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_111_cast_fp16 = add(x = hidden_states_105_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1]> const_129_promoted_to_fp16 = const()[name = string("const_129_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.42p-3])];
+            tensor<fp16, [1, 3, 2560]> x_219_cast_fp16 = mul(x = hidden_states_111_cast_fp16, y = const_129_promoted_to_fp16)[name = string("x_219_cast_fp16")];
+            int32 var_7542 = const()[name = string("op_7542"), val = int32(-1)];
+            fp16 const_130_promoted_to_fp16 = const()[name = string("const_130_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_7544_cast_fp16 = mul(x = x_219_cast_fp16, y = const_130_promoted_to_fp16)[name = string("op_7544_cast_fp16")];
+            bool input_327_interleave_0 = const()[name = string("input_327_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_327_cast_fp16 = concat(axis = var_7542, interleave = input_327_interleave_0, values = (x_219_cast_fp16, var_7544_cast_fp16))[name = string("input_327_cast_fp16")];
+            tensor<int32, [1]> normed_309_axes_0 = const()[name = string("normed_309_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7539_to_fp16 = const()[name = string("op_7539_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_309_cast_fp16 = layer_norm(axes = normed_309_axes_0, epsilon = var_7539_to_fp16, x = input_327_cast_fp16)[name = string("normed_309_cast_fp16")];
+            tensor<int32, [2]> var_7549_split_sizes_0 = const()[name = string("op_7549_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7549_axis_0 = const()[name = string("op_7549_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7549_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7549_cast_fp16_1 = split(axis = var_7549_axis_0, split_sizes = var_7549_split_sizes_0, x = normed_309_cast_fp16)[name = string("op_7549_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566600320)))];
+            tensor<fp16, [1, 3, 2560]> h_67_cast_fp16 = mul(x = var_7549_cast_fp16_0, y = layers_11_input_layernorm_weight_promoted_to_fp16)[name = string("h_67_cast_fp16")];
+            tensor<int32, [3]> var_7555 = const()[name = string("op_7555"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_7558_axes_0 = const()[name = string("op_7558_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_7556_cast_fp16 = transpose(perm = var_7555, x = h_67_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 2560, 1, 3]> var_7558_cast_fp16 = expand_dims(axes = var_7558_axes_0, x = var_7556_cast_fp16)[name = string("op_7558_cast_fp16")];
+            string q_133_pad_type_0 = const()[name = string("q_133_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_133_strides_0 = const()[name = string("q_133_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_133_pad_0 = const()[name = string("q_133_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_133_dilations_0 = const()[name = string("q_133_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_133_groups_0 = const()[name = string("q_133_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_133 = conv(dilations = q_133_dilations_0, groups = q_133_groups_0, pad = q_133_pad_0, pad_type = q_133_pad_type_0, strides = q_133_strides_0, weight = layers_11_self_attn_q_proj_weight_palettized, x = var_7558_cast_fp16)[name = string("q_133")];
+            tensor<int32, [4]> var_7579 = const()[name = string("op_7579"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_7580 = reshape(shape = var_7579, x = q_133)[name = string("op_7580")];
+            tensor<int32, [4]> transpose_81_perm_0 = const()[name = string("transpose_81_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_7603 = const()[name = string("op_7603"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_81 = transpose(perm = transpose_81_perm_0, x = var_7580)[name = string("transpose_18")];
+            tensor<fp16, [3, 8, 512]> x_221 = reshape(shape = var_7603, x = transpose_81)[name = string("x_221")];
+            int32 var_7609 = const()[name = string("op_7609"), val = int32(-1)];
+            fp16 const_131_promoted = const()[name = string("const_131_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_7611 = mul(x = x_221, y = const_131_promoted)[name = string("op_7611")];
+            bool input_331_interleave_0 = const()[name = string("input_331_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_331 = concat(axis = var_7609, interleave = input_331_interleave_0, values = (x_221, var_7611))[name = string("input_331")];
+            tensor<int32, [1]> normed_313_axes_0 = const()[name = string("normed_313_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7606_to_fp16 = const()[name = string("op_7606_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_313_cast_fp16 = layer_norm(axes = normed_313_axes_0, epsilon = var_7606_to_fp16, x = input_331)[name = string("normed_313_cast_fp16")];
+            tensor<int32, [2]> var_7616_split_sizes_0 = const()[name = string("op_7616_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7616_axis_0 = const()[name = string("op_7616_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_7616_0, tensor<fp16, [3, 8, 512]> var_7616_1 = split(axis = var_7616_axis_0, split_sizes = var_7616_split_sizes_0, x = normed_313_cast_fp16)[name = string("op_7616")];
+            tensor<fp16, [3, 8, 512]> q_137 = mul(x = var_7616_0, y = layers_11_self_attn_q_norm_weight)[name = string("q_137")];
+            tensor<int32, [4]> var_7623 = const()[name = string("op_7623"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_7624 = reshape(shape = var_7623, x = q_137)[name = string("op_7624")];
+            tensor<int32, [4]> var_7629 = const()[name = string("op_7629"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_139 = transpose(perm = var_7629, x = var_7624)[name = string("transpose_17")];
+            tensor<fp16, [1, 8, 3, 512]> var_7631_cast_fp16 = mul(x = q_139, y = cos_f)[name = string("op_7631_cast_fp16")];
+            tensor<int32, [2]> var_7632_split_sizes_0 = const()[name = string("op_7632_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7632_axis_0 = const()[name = string("op_7632_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_7632_0, tensor<fp16, [1, 8, 3, 256]> var_7632_1 = split(axis = var_7632_axis_0, split_sizes = var_7632_split_sizes_0, x = q_139)[name = string("op_7632")];
+            fp16 const_132_promoted = const()[name = string("const_132_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_7634 = mul(x = var_7632_1, y = const_132_promoted)[name = string("op_7634")];
+            int32 var_7636 = const()[name = string("op_7636"), val = int32(-1)];
+            bool var_7637_interleave_0 = const()[name = string("op_7637_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_7637 = concat(axis = var_7636, interleave = var_7637_interleave_0, values = (var_7634, var_7632_0))[name = string("op_7637")];
+            tensor<fp16, [1, 8, 3, 512]> var_7638_cast_fp16 = mul(x = var_7637, y = sin_f)[name = string("op_7638_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_cast_fp16 = add(x = var_7631_cast_fp16, y = var_7638_cast_fp16)[name = string("q_cast_fp16")];
+            string k_69_pad_type_0 = const()[name = string("k_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> k_69_strides_0 = const()[name = string("k_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> k_69_pad_0 = const()[name = string("k_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> k_69_dilations_0 = const()[name = string("k_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 k_69_groups_0 = const()[name = string("k_69_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> k_69 = conv(dilations = k_69_dilations_0, groups = k_69_groups_0, pad = k_69_pad_0, pad_type = k_69_pad_type_0, strides = k_69_strides_0, weight = layers_11_self_attn_k_proj_weight_palettized, x = var_7558_cast_fp16)[name = string("k_69")];
+            tensor<int32, [4]> var_7656 = const()[name = string("op_7656"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_7657 = reshape(shape = var_7656, x = k_69)[name = string("op_7657")];
+            tensor<int32, [4]> transpose_82_perm_0 = const()[name = string("transpose_82_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            string v_25_pad_type_0 = const()[name = string("v_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> v_25_strides_0 = const()[name = string("v_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> v_25_pad_0 = const()[name = string("v_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> v_25_dilations_0 = const()[name = string("v_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 v_25_groups_0 = const()[name = string("v_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 3]> v_25 = conv(dilations = v_25_dilations_0, groups = v_25_groups_0, pad = v_25_pad_0, pad_type = v_25_pad_type_0, strides = v_25_strides_0, weight = layers_11_self_attn_v_proj_weight_palettized, x = var_7558_cast_fp16)[name = string("v_25")];
+            tensor<int32, [4]> var_7684 = const()[name = string("op_7684"), val = tensor<int32, [4]>([1, 2, 512, 3])];
+            tensor<fp16, [1, 2, 512, 3]> var_7685 = reshape(shape = var_7684, x = v_25)[name = string("op_7685")];
+            tensor<int32, [4]> var_7690 = const()[name = string("op_7690"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7708 = const()[name = string("op_7708"), val = tensor<int32, [3]>([3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> transpose_82 = transpose(perm = transpose_82_perm_0, x = var_7657)[name = string("transpose_16")];
+            tensor<fp16, [3, 2, 512]> x_223 = reshape(shape = var_7708, x = transpose_82)[name = string("x_223")];
+            int32 var_7714 = const()[name = string("op_7714"), val = int32(-1)];
+            fp16 const_133_promoted = const()[name = string("const_133_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 2, 512]> var_7716 = mul(x = x_223, y = const_133_promoted)[name = string("op_7716")];
+            bool input_333_interleave_0 = const()[name = string("input_333_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 2, 1024]> input_333 = concat(axis = var_7714, interleave = input_333_interleave_0, values = (x_223, var_7716))[name = string("input_333")];
+            tensor<int32, [1]> normed_317_axes_0 = const()[name = string("normed_317_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7711_to_fp16 = const()[name = string("op_7711_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 2, 1024]> normed_317_cast_fp16 = layer_norm(axes = normed_317_axes_0, epsilon = var_7711_to_fp16, x = input_333)[name = string("normed_317_cast_fp16")];
+            tensor<int32, [2]> var_7721_split_sizes_0 = const()[name = string("op_7721_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7721_axis_0 = const()[name = string("op_7721_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 2, 512]> var_7721_0, tensor<fp16, [3, 2, 512]> var_7721_1 = split(axis = var_7721_axis_0, split_sizes = var_7721_split_sizes_0, x = normed_317_cast_fp16)[name = string("op_7721")];
+            tensor<fp16, [3, 2, 512]> k_73 = mul(x = var_7721_0, y = layers_11_self_attn_k_norm_weight)[name = string("k_73")];
+            tensor<int32, [4]> var_7728 = const()[name = string("op_7728"), val = tensor<int32, [4]>([1, 3, 2, 512])];
+            tensor<fp16, [1, 3, 2, 512]> var_7729 = reshape(shape = var_7728, x = k_73)[name = string("op_7729")];
+            tensor<int32, [4]> var_7734 = const()[name = string("op_7734"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            fp16 var_7736_promoted = const()[name = string("op_7736_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 3, 512]> var_7691 = transpose(perm = var_7690, x = var_7685)[name = string("transpose_15")];
+            tensor<fp16, [1, 2, 3, 512]> var_7737 = pow(x = var_7691, y = var_7736_promoted)[name = string("op_7737")];
+            tensor<int32, [1]> var_7742_axes_0 = const()[name = string("op_7742_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7742_keep_dims_0 = const()[name = string("op_7742_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 3, 1]> var_7742 = reduce_mean(axes = var_7742_axes_0, keep_dims = var_7742_keep_dims_0, x = var_7737)[name = string("op_7742")];
+            fp16 var_7744_to_fp16 = const()[name = string("op_7744_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 3, 1]> mean_sq_cast_fp16 = add(x = var_7742, y = var_7744_to_fp16)[name = string("mean_sq_cast_fp16")];
+            fp32 var_7746_epsilon_0 = const()[name = string("op_7746_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 3, 1]> var_7746_cast_fp16 = rsqrt(epsilon = var_7746_epsilon_0, x = mean_sq_cast_fp16)[name = string("op_7746_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> v_cast_fp16 = mul(x = var_7691, y = var_7746_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> q_141 = transpose(perm = var_7734, x = var_7729)[name = string("transpose_14")];
+            tensor<fp16, [1, 2, 3, 512]> var_7748_cast_fp16 = mul(x = q_141, y = cos_f)[name = string("op_7748_cast_fp16")];
+            tensor<int32, [2]> var_7749_split_sizes_0 = const()[name = string("op_7749_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7749_axis_0 = const()[name = string("op_7749_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 3, 256]> var_7749_0, tensor<fp16, [1, 2, 3, 256]> var_7749_1 = split(axis = var_7749_axis_0, split_sizes = var_7749_split_sizes_0, x = q_141)[name = string("op_7749")];
+            fp16 const_134_promoted = const()[name = string("const_134_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 3, 256]> var_7751 = mul(x = var_7749_1, y = const_134_promoted)[name = string("op_7751")];
+            int32 var_7753 = const()[name = string("op_7753"), val = int32(-1)];
+            bool var_7754_interleave_0 = const()[name = string("op_7754_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 3, 512]> var_7754 = concat(axis = var_7753, interleave = var_7754_interleave_0, values = (var_7751, var_7749_0))[name = string("op_7754")];
+            tensor<fp16, [1, 2, 3, 512]> var_7755_cast_fp16 = mul(x = var_7754, y = sin_f)[name = string("op_7755_cast_fp16")];
+            tensor<fp16, [1, 2, 3, 512]> k_cast_fp16 = add(x = var_7748_cast_fp16, y = var_7755_cast_fp16)[name = string("k_cast_fp16")];
+            bool k_scattered_transpose_x_0 = const()[name = string("k_scattered_transpose_x_0"), val = bool(false)];
+            bool k_scattered_transpose_y_0 = const()[name = string("k_scattered_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> k_scattered_cast_fp16 = matmul(transpose_x = k_scattered_transpose_x_0, transpose_y = k_scattered_transpose_y_0, x = var_4055_cast_fp16, y = k_cast_fp16)[name = string("k_scattered_cast_fp16")];
+            bool v_scattered_transpose_x_0 = const()[name = string("v_scattered_transpose_x_0"), val = bool(false)];
+            bool v_scattered_transpose_y_0 = const()[name = string("v_scattered_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 2048, 512]> v_scattered_cast_fp16 = matmul(transpose_x = v_scattered_transpose_x_0, transpose_y = v_scattered_transpose_y_0, x = var_4055_cast_fp16, y = v_cast_fp16)[name = string("v_scattered_cast_fp16")];
+            tensor<int32, [4]> slot_k_begin_0 = const()[name = string("slot_k_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_k_end_0 = const()[name = string("slot_k_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_k_end_mask_0 = const()[name = string("slot_k_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_k_cast_fp16 = slice_by_index(begin = slot_k_begin_0, end = slot_k_end_0, end_mask = slot_k_end_mask_0, x = K_full_out_1_cast_fp16)[name = string("slot_k_cast_fp16")];
+            tensor<int32, [4]> slot_v_begin_0 = const()[name = string("slot_v_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> slot_v_end_0 = const()[name = string("slot_v_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> slot_v_end_mask_0 = const()[name = string("slot_v_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> slot_v_cast_fp16 = slice_by_index(begin = slot_v_begin_0, end = slot_v_end_0, end_mask = slot_v_end_mask_0, x = V_full_out_1_cast_fp16)[name = string("slot_v_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7792_cast_fp16 = mul(x = slot_k_cast_fp16, y = var_4082_cast_fp16)[name = string("op_7792_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_k_cast_fp16 = add(x = var_7792_cast_fp16, y = k_scattered_cast_fp16)[name = string("new_k_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7798_cast_fp16 = mul(x = slot_v_cast_fp16, y = var_4082_cast_fp16)[name = string("op_7798_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> new_v_cast_fp16 = add(x = var_7798_cast_fp16, y = v_scattered_cast_fp16)[name = string("new_v_cast_fp16")];
+            int32 var_7812 = const()[name = string("op_7812"), val = int32(0)];
+            bool K_full_out_interleave_0 = const()[name = string("K_full_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out = concat(axis = var_7812, interleave = K_full_out_interleave_0, values = (var_4122_cast_fp16, new_k_cast_fp16))[name = string("K_full_out_cast_fp16")];
+            int32 var_7825 = const()[name = string("op_7825"), val = int32(0)];
+            bool V_full_out_interleave_0 = const()[name = string("V_full_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out = concat(axis = var_7825, interleave = V_full_out_interleave_0, values = (var_4132_cast_fp16, new_v_cast_fp16))[name = string("V_full_out_cast_fp16")];
+            tensor<int32, [4]> var_7831_begin_0 = const()[name = string("op_7831_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_7831_end_0 = const()[name = string("op_7831_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_7831_end_mask_0 = const()[name = string("op_7831_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> kv14_k = slice_by_index(begin = var_7831_begin_0, end = var_7831_end_0, end_mask = var_7831_end_mask_0, x = K_full_out)[name = string("op_7831_cast_fp16")];
+            tensor<int32, [4]> var_7841_begin_0 = const()[name = string("op_7841_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_7841_end_0 = const()[name = string("op_7841_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_7841_end_mask_0 = const()[name = string("op_7841_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 2048, 512]> kv14_v = slice_by_index(begin = var_7841_begin_0, end = var_7841_end_0, end_mask = var_7841_end_mask_0, x = V_full_out)[name = string("op_7841_cast_fp16")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_22_reps_0 = const()[name = string("tile_22_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_44_cast_fp16 = transpose(perm = transpose_44_perm_0, x = kv14_k)[name = string("transpose_13")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_22_cast_fp16 = tile(reps = tile_22_reps_0, x = transpose_44_cast_fp16)[name = string("tile_22_cast_fp16")];
+            tensor<int32, [5]> concat_48 = const()[name = string("concat_48"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_44_cast_fp16 = reshape(shape = concat_48, x = tile_22_cast_fp16)[name = string("reshape_44_cast_fp16")];
+            tensor<int32, [5]> transpose_45_perm_0 = const()[name = string("transpose_45_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_49 = const()[name = string("concat_49"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_45_cast_fp16 = transpose(perm = transpose_45_perm_0, x = reshape_44_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_45_cast_fp16 = reshape(shape = concat_49, x = transpose_45_cast_fp16)[name = string("reshape_45_cast_fp16")];
+            tensor<int32, [4]> transpose_83_perm_0 = const()[name = string("transpose_83_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_23_reps_0 = const()[name = string("tile_23_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_46_cast_fp16 = transpose(perm = transpose_46_perm_0, x = kv14_v)[name = string("transpose_11")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_23_cast_fp16 = tile(reps = tile_23_reps_0, x = transpose_46_cast_fp16)[name = string("tile_23_cast_fp16")];
+            tensor<int32, [5]> concat_50 = const()[name = string("concat_50"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_46_cast_fp16 = reshape(shape = concat_50, x = tile_23_cast_fp16)[name = string("reshape_46_cast_fp16")];
+            tensor<int32, [5]> transpose_47_perm_0 = const()[name = string("transpose_47_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_51 = const()[name = string("concat_51"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_47_cast_fp16 = transpose(perm = transpose_47_perm_0, x = reshape_46_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_47_cast_fp16 = reshape(shape = concat_51, x = transpose_47_cast_fp16)[name = string("reshape_47_cast_fp16")];
+            tensor<int32, [4]> V_expanded_perm_0 = const()[name = string("V_expanded_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_45_transpose_x_0 = const()[name = string("attn_weights_45_transpose_x_0"), val = bool(false)];
+            bool attn_weights_45_transpose_y_0 = const()[name = string("attn_weights_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_83_cast_fp16 = transpose(perm = transpose_83_perm_0, x = reshape_45_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_45_cast_fp16 = matmul(transpose_x = attn_weights_45_transpose_x_0, transpose_y = attn_weights_45_transpose_y_0, x = q_cast_fp16, y = transpose_83_cast_fp16)[name = string("attn_weights_45_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_227_cast_fp16 = add(x = attn_weights_45_cast_fp16, y = causal_mask_full)[name = string("x_227_cast_fp16")];
+            tensor<int32, [1]> reduce_max_11_axes_0 = const()[name = string("reduce_max_11_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_11_keep_dims_0 = const()[name = string("reduce_max_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_11 = reduce_max(axes = reduce_max_11_axes_0, keep_dims = reduce_max_11_keep_dims_0, x = x_227_cast_fp16)[name = string("reduce_max_11")];
+            tensor<fp16, [1, 8, 3, 2048]> var_7896 = sub(x = x_227_cast_fp16, y = reduce_max_11)[name = string("op_7896")];
+            tensor<fp16, [1, 8, 3, 2048]> var_7902 = exp(x = var_7896)[name = string("op_7902")];
+            tensor<int32, [1]> var_7912_axes_0 = const()[name = string("op_7912_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7912_keep_dims_0 = const()[name = string("op_7912_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_7912 = reduce_sum(axes = var_7912_axes_0, keep_dims = var_7912_keep_dims_0, x = var_7902)[name = string("op_7912")];
+            tensor<fp16, [1, 8, 3, 2048]> var_7918_cast_fp16 = real_div(x = var_7902, y = var_7912)[name = string("op_7918_cast_fp16")];
+            bool attn_output_67_transpose_x_0 = const()[name = string("attn_output_67_transpose_x_0"), val = bool(false)];
+            bool attn_output_67_transpose_y_0 = const()[name = string("attn_output_67_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_cast_fp16 = transpose(perm = V_expanded_perm_0, x = reshape_47_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_67_cast_fp16 = matmul(transpose_x = attn_output_67_transpose_x_0, transpose_y = attn_output_67_transpose_y_0, x = var_7918_cast_fp16, y = V_expanded_cast_fp16)[name = string("attn_output_67_cast_fp16")];
+            tensor<int32, [4]> var_7929 = const()[name = string("op_7929"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7936 = const()[name = string("op_7936"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_7930_cast_fp16 = transpose(perm = var_7929, x = attn_output_67_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 3, 4096]> attn_output_69_cast_fp16 = reshape(shape = var_7936, x = var_7930_cast_fp16)[name = string("attn_output_69_cast_fp16")];
+            tensor<int32, [3]> var_7941 = const()[name = string("op_7941"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7957_pad_type_0 = const()[name = string("op_7957_pad_type_0"), val = string("valid")];
+            int32 var_7957_groups_0 = const()[name = string("op_7957_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7957_strides_0 = const()[name = string("op_7957_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7957_pad_0 = const()[name = string("op_7957_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7957_dilations_0 = const()[name = string("op_7957_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_11_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(566605504))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571848448))))[name = string("squeeze_11_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_7942_cast_fp16 = transpose(perm = var_7941, x = attn_output_69_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 3]> var_7957_cast_fp16 = conv(dilations = var_7957_dilations_0, groups = var_7957_groups_0, pad = var_7957_pad_0, pad_type = var_7957_pad_type_0, strides = var_7957_strides_0, weight = squeeze_11_cast_fp16_to_fp32_to_fp16_palettized, x = var_7942_cast_fp16)[name = string("op_7957_cast_fp16")];
+            tensor<int32, [3]> var_7961 = const()[name = string("op_7961"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7967 = const()[name = string("op_7967"), val = int32(-1)];
+            fp16 const_135_promoted_to_fp16 = const()[name = string("const_135_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_231_cast_fp16 = transpose(perm = var_7961, x = var_7957_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 3, 2560]> var_7969_cast_fp16 = mul(x = x_231_cast_fp16, y = const_135_promoted_to_fp16)[name = string("op_7969_cast_fp16")];
+            bool input_337_interleave_0 = const()[name = string("input_337_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_337_cast_fp16 = concat(axis = var_7967, interleave = input_337_interleave_0, values = (x_231_cast_fp16, var_7969_cast_fp16))[name = string("input_337_cast_fp16")];
+            tensor<int32, [1]> normed_321_axes_0 = const()[name = string("normed_321_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7964_to_fp16 = const()[name = string("op_7964_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_321_cast_fp16 = layer_norm(axes = normed_321_axes_0, epsilon = var_7964_to_fp16, x = input_337_cast_fp16)[name = string("normed_321_cast_fp16")];
+            tensor<int32, [2]> var_7974_split_sizes_0 = const()[name = string("op_7974_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7974_axis_0 = const()[name = string("op_7974_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7974_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7974_cast_fp16_1 = split(axis = var_7974_axis_0, split_sizes = var_7974_split_sizes_0, x = normed_321_cast_fp16)[name = string("op_7974_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571851072)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_cast_fp16 = mul(x = var_7974_cast_fp16_0, y = layers_11_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_233_cast_fp16 = add(x = x_219_cast_fp16, y = attn_output_cast_fp16)[name = string("x_233_cast_fp16")];
+            int32 var_7983 = const()[name = string("op_7983"), val = int32(-1)];
+            fp16 const_136_promoted_to_fp16 = const()[name = string("const_136_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_7985_cast_fp16 = mul(x = x_233_cast_fp16, y = const_136_promoted_to_fp16)[name = string("op_7985_cast_fp16")];
+            bool input_339_interleave_0 = const()[name = string("input_339_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_339_cast_fp16 = concat(axis = var_7983, interleave = input_339_interleave_0, values = (x_233_cast_fp16, var_7985_cast_fp16))[name = string("input_339_cast_fp16")];
+            tensor<int32, [1]> normed_325_axes_0 = const()[name = string("normed_325_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7980_to_fp16 = const()[name = string("op_7980_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_325_cast_fp16 = layer_norm(axes = normed_325_axes_0, epsilon = var_7980_to_fp16, x = input_339_cast_fp16)[name = string("normed_325_cast_fp16")];
+            tensor<int32, [2]> var_7990_split_sizes_0 = const()[name = string("op_7990_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7990_axis_0 = const()[name = string("op_7990_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_7990_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_7990_cast_fp16_1 = split(axis = var_7990_axis_0, split_sizes = var_7990_split_sizes_0, x = normed_325_cast_fp16)[name = string("op_7990_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571856256)))];
+            tensor<fp16, [1, 3, 2560]> h_69_cast_fp16 = mul(x = var_7990_cast_fp16_0, y = layers_11_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_69_cast_fp16")];
+            tensor<int32, [3]> var_8001 = const()[name = string("op_8001"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_341_axes_0 = const()[name = string("input_341_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8002 = transpose(perm = var_8001, x = h_69_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 3]> input_341 = expand_dims(axes = input_341_axes_0, x = var_8002)[name = string("input_341")];
+            string gate_45_pad_type_0 = const()[name = string("gate_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_45_strides_0 = const()[name = string("gate_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_45_pad_0 = const()[name = string("gate_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_45_dilations_0 = const()[name = string("gate_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_45_groups_0 = const()[name = string("gate_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_45 = conv(dilations = gate_45_dilations_0, groups = gate_45_groups_0, pad = gate_45_pad_0, pad_type = gate_45_pad_type_0, strides = gate_45_strides_0, weight = layers_11_mlp_gate_proj_weight_palettized, x = input_341)[name = string("gate_45")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_11_mlp_up_proj_weight_palettized, x = input_341)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate = gelu(mode = gate_mode_0, x = gate_45)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 3]> input_343 = mul(x = gate, y = up)[name = string("input_343")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_11_mlp_down_proj_weight_palettized, x = input_343)[name = string("mlp_out")];
+            tensor<int32, [1]> var_8042_axes_0 = const()[name = string("op_8042_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8042 = squeeze(axes = var_8042_axes_0, x = mlp_out)[name = string("op_8042")];
+            tensor<int32, [3]> var_8046 = const()[name = string("op_8046"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8052 = const()[name = string("op_8052"), val = int32(-1)];
+            fp16 const_137_promoted = const()[name = string("const_137_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_235 = transpose(perm = var_8046, x = var_8042)[name = string("transpose_3")];
+            tensor<fp16, [1, 3, 2560]> var_8054 = mul(x = x_235, y = const_137_promoted)[name = string("op_8054")];
+            bool input_345_interleave_0 = const()[name = string("input_345_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_345 = concat(axis = var_8052, interleave = input_345_interleave_0, values = (x_235, var_8054))[name = string("input_345")];
+            tensor<int32, [1]> normed_329_axes_0 = const()[name = string("normed_329_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8049_to_fp16 = const()[name = string("op_8049_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_329_cast_fp16 = layer_norm(axes = normed_329_axes_0, epsilon = var_8049_to_fp16, x = input_345)[name = string("normed_329_cast_fp16")];
+            tensor<int32, [2]> var_8059_split_sizes_0 = const()[name = string("op_8059_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8059_axis_0 = const()[name = string("op_8059_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_8059_0, tensor<fp16, [1, 3, 2560]> var_8059_1 = split(axis = var_8059_axis_0, split_sizes = var_8059_split_sizes_0, x = normed_329_cast_fp16)[name = string("op_8059")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_113 = mul(x = var_8059_0, y = layers_11_post_feedforward_layernorm_weight)[name = string("hidden_states_113")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_115_cast_fp16 = add(x = x_233_cast_fp16, y = hidden_states_113)[name = string("hidden_states_115_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 5888])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 3, 6144])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_8087 = const()[name = string("op_8087"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_347_axes_0 = const()[name = string("input_347_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8088 = transpose(perm = var_8087, x = hidden_states_115_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 3]> input_347 = expand_dims(axes = input_347_axes_0, x = var_8088)[name = string("input_347")];
+            string gated_67_pad_type_0 = const()[name = string("gated_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_67_strides_0 = const()[name = string("gated_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_67_pad_0 = const()[name = string("gated_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_67_dilations_0 = const()[name = string("gated_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_67_groups_0 = const()[name = string("gated_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_67 = conv(dilations = gated_67_dilations_0, groups = gated_67_groups_0, pad = gated_67_pad_0, pad_type = gated_67_pad_type_0, strides = gated_67_strides_0, weight = layers_11_per_layer_input_gate_weight_palettized, x = input_347)[name = string("gated_67")];
+            string gated_69_mode_0 = const()[name = string("gated_69_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_69 = gelu(mode = gated_69_mode_0, x = gated_67)[name = string("gated_69")];
+            tensor<int32, [3]> var_8107 = const()[name = string("op_8107"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_8108_cast_fp16 = transpose(perm = var_8107, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_8108_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_349_cast_fp16 = mul(x = gated_69, y = per_layer_slice_conv_cast_fp16)[name = string("input_349_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_11_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(571861440))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(572189184))))[name = string("layers_11_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_11_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_349_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_8124_axes_0 = const()[name = string("op_8124_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_8124_cast_fp16 = squeeze(axes = var_8124_axes_0, x = gated_cast_fp16)[name = string("op_8124_cast_fp16")];
+            tensor<int32, [3]> var_8128 = const()[name = string("op_8128"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8134 = const()[name = string("op_8134"), val = int32(-1)];
+            fp16 const_138_promoted_to_fp16 = const()[name = string("const_138_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_cast_fp16 = transpose(perm = var_8128, x = var_8124_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 3, 2560]> var_8136_cast_fp16 = mul(x = x_cast_fp16, y = const_138_promoted_to_fp16)[name = string("op_8136_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_cast_fp16 = concat(axis = var_8134, interleave = input_interleave_0, values = (x_cast_fp16, var_8136_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_333_axes_0 = const()[name = string("normed_333_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8131_to_fp16 = const()[name = string("op_8131_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_333_cast_fp16 = layer_norm(axes = normed_333_axes_0, epsilon = var_8131_to_fp16, x = input_cast_fp16)[name = string("normed_333_cast_fp16")];
+            tensor<int32, [2]> var_8141_split_sizes_0 = const()[name = string("op_8141_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8141_axis_0 = const()[name = string("op_8141_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_8141_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_8141_cast_fp16_1 = split(axis = var_8141_axis_0, split_sizes = var_8141_split_sizes_0, x = normed_333_cast_fp16)[name = string("op_8141_cast_fp16")];
+            tensor<fp16, [2560]> layers_11_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_11_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(572191808)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_119_cast_fp16 = mul(x = var_8141_cast_fp16_0, y = layers_11_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_115_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_139_promoted_to_fp16 = const()[name = string("const_139_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.0cp-4])];
+            tensor<fp16, [1, 3, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_139_promoted_to_fp16)[name = string("op_8151_cast_fp16")];
+        } -> (hidden_states_out, K_sliding_out, V_sliding_out, K_full_out, V_full_out, kv13_k, kv13_v, kv14_k, kv14_v);
+}
\ No newline at end of file
diff --git a/chunk2.mlmodelc/weights/weight.bin b/chunk2.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f8045f8f733b5dc1a6d7f1a7af51616c898f200d
--- /dev/null
+++ b/chunk2.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6cd92e3945e5e809a15df7a1d9e648fb651e859d733d9589eba817805e2d96d
+size 572196992
diff --git a/chunk2_3way.mlmodelc/analytics/coremldata.bin b/chunk2_3way.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f60ff7a72ea4dd66c7a0676779b6bc8b8c360445
--- /dev/null
+++ b/chunk2_3way.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:223a79744041af35a291271aca045883b40f5cc88ad1fb9040a2ee0a2a5b25b9
+size 243
diff --git a/chunk2_3way.mlmodelc/coremldata.bin b/chunk2_3way.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b39ce2e01c235a36ebcc7235d33b5057c27ec648
--- /dev/null
+++ b/chunk2_3way.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:476686c14666d2a23a5e04271a9bb1f2ce006c76ac085370b9f10fe90a05c810
+size 979
diff --git a/chunk2_3way.mlmodelc/metadata.json b/chunk2_3way.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..33391944fd34a8cf16e1b72070ee0274a55a4b18
--- /dev/null
+++ b/chunk2_3way.mlmodelc/metadata.json
@@ -0,0 +1,285 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Palettized (10 bits), Palettized (11 bits), Palettized (13 bits), Palettized (7 bits), Palettized (8 bits), Palettized (9 bits), UInt4)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 2560)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 2560]",
+        "name" : "hidden_states_out",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 10 × 2 × 512 × 512)",
+        "shortDescription" : "",
+        "shape" : "[10, 2, 512, 512]",
+        "name" : "K_sliding_out",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 10 × 2 × 512 × 512)",
+        "shortDescription" : "",
+        "shape" : "[10, 2, 512, 512]",
+        "name" : "V_sliding_out",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 2 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[2, 2, 2048, 512]",
+        "name" : "K_full_out",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 2 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[2, 2, 2048, 512]",
+        "name" : "V_full_out",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 512 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 512, 256]",
+        "name" : "kv13_k",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 512 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 512, 256]",
+        "name" : "kv13_v",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 2048, 512]",
+        "name" : "kv14_k",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 2048, 512]",
+        "name" : "kv14_v",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 108,
+      "Ios18.mul" : 457,
+      "Ios18.matmul" : 42,
+      "Ios18.rsqrt" : 12,
+      "Ios18.exp" : 21,
+      "Ios16.reduceMean" : 12,
+      "Ios18.realDiv" : 21,
+      "Split" : 171,
+      "Ios16.reduceMax" : 21,
+      "Tile" : 28,
+      "Ios18.add" : 133,
+      "Ios16.reduceSum" : 21,
+      "Ios18.layerNorm" : 138,
+      "Ios18.reshape" : 180,
+      "Pad" : 20,
+      "Ios18.constexprLutToDense" : 171,
+      "Ios18.conv" : 171,
+      "Ios18.concat" : 191,
+      "Ios18.transpose" : 306,
+      "Ios18.sub" : 22,
+      "Ios18.pow" : 12,
+      "Ios18.gelu" : 42,
+      "Stack" : 4,
+      "Ios18.sliceByIndex" : 85,
+      "Ios18.squeeze" : 66
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-04-30",
+      "com.github.apple.coremltools.source" : "torch==2.11.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 2560)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 2560]",
+        "name" : "hidden_states",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 2048)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 2048]",
+        "name" : "causal_mask_full",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 512]",
+        "name" : "causal_mask_sliding",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 2048 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 2048, 1]",
+        "name" : "update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 10752)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 10752]",
+        "name" : "per_layer_combined",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 256]",
+        "name" : "cos_s",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 256]",
+        "name" : "sin_s",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 512]",
+        "name" : "cos_f",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 512]",
+        "name" : "sin_f",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 10 × 2 × 512 × 512)",
+        "shortDescription" : "",
+        "shape" : "[10, 2, 512, 512]",
+        "name" : "K_sliding_in",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 10 × 2 × 512 × 512)",
+        "shortDescription" : "",
+        "shape" : "[10, 2, 512, 512]",
+        "name" : "V_sliding_in",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 2 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[2, 2, 2048, 512]",
+        "name" : "K_full_in",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 2 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[2, 2, 2048, 512]",
+        "name" : "V_full_in",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "chunk2_3way",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/chunk2_3way.mlmodelc/model.mil b/chunk2_3way.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..4bca220b2305808e8f867a57f71ff4f06053db2b
--- /dev/null
+++ b/chunk2_3way.mlmodelc/model.mil
@@ -0,0 +1,5936 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<fp16, [2, 2, 2048, 512]> K_full_in, tensor<fp16, [10, 2, 512, 512]> K_sliding_in, tensor<fp16, [2, 2, 2048, 512]> V_full_in, tensor<fp16, [10, 2, 512, 512]> V_sliding_in, tensor<fp16, [1, 1, 1, 2048]> causal_mask_full, tensor<fp16, [1, 1, 1, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 1, 512]> cos_f, tensor<fp16, [1, 1, 1, 256]> cos_s, tensor<fp16, [1, 1, 2560]> hidden_states, tensor<fp16, [1, 1, 10752]> per_layer_combined, tensor<fp16, [1, 1, 1, 512]> sin_f, tensor<fp16, [1, 1, 1, 256]> sin_s, tensor<fp16, [1, 1, 2048, 1]> update_mask) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_c2_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_0_self_attn_q_norm_weight = const()[name = string("layers_c2_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_0_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3279680))))[name = string("layers_c2_0_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_0_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3280256))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3935680))))[name = string("layers_c2_0_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_0_self_attn_k_norm_weight = const()[name = string("layers_c2_0_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3936256)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(3936832))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17044096))))[name = string("layers_c2_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(17054400))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30161664))))[name = string("layers_c2_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(30171968))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43279232))))[name = string("layers_c2_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_0_post_feedforward_layernorm_weight = const()[name = string("layers_c2_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43281856)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43287040))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43614784))))[name = string("layers_c2_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(43615104))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46236608))))[name = string("layers_c2_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_1_self_attn_q_norm_weight = const()[name = string("layers_c2_1_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46238720)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_1_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46239296))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46894720))))[name = string("layers_c2_1_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_1_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(46895296))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47550720))))[name = string("layers_c2_1_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_1_self_attn_k_norm_weight = const()[name = string("layers_c2_1_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47551296)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(47551872))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60659136))))[name = string("layers_c2_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(60669440))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73776704))))[name = string("layers_c2_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73787008))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86894272))))[name = string("layers_c2_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_1_post_feedforward_layernorm_weight = const()[name = string("layers_c2_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86896896)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(86902080))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87229824))))[name = string("layers_c2_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87230144))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89851648))))[name = string("layers_c2_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_2_self_attn_q_norm_weight = const()[name = string("layers_c2_2_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89853760)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_2_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89854336))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90509760))))[name = string("layers_c2_2_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_2_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(90510336))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91165760))))[name = string("layers_c2_2_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_2_self_attn_k_norm_weight = const()[name = string("layers_c2_2_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91166336)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(91166912))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104274176))))[name = string("layers_c2_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(104284480))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117391744))))[name = string("layers_c2_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(117402048))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130509312))))[name = string("layers_c2_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_2_post_feedforward_layernorm_weight = const()[name = string("layers_c2_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130511936)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130517120))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130844864))))[name = string("layers_c2_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130845184))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133466688))))[name = string("layers_c2_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_3_self_attn_q_norm_weight = const()[name = string("layers_c2_3_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133468800)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_3_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(133469376))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134124800))))[name = string("layers_c2_3_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_3_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134125376))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134780800))))[name = string("layers_c2_3_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_3_self_attn_k_norm_weight = const()[name = string("layers_c2_3_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134781376)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134781952))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147889216))))[name = string("layers_c2_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(147899520))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161006784))))[name = string("layers_c2_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(161017088))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174124352))))[name = string("layers_c2_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_3_post_feedforward_layernorm_weight = const()[name = string("layers_c2_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174126976)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174132160))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174459904))))[name = string("layers_c2_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174460224))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177081728))))[name = string("layers_c2_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_4_self_attn_q_norm_weight = const()[name = string("layers_c2_4_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177083840)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_4_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177084416))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177739840))))[name = string("layers_c2_4_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_4_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177740416))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178395840))))[name = string("layers_c2_4_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_4_self_attn_k_norm_weight = const()[name = string("layers_c2_4_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178396416)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(178396992))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191504256))))[name = string("layers_c2_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191514560))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204621824))))[name = string("layers_c2_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(204632128))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217739392))))[name = string("layers_c2_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_4_post_feedforward_layernorm_weight = const()[name = string("layers_c2_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217742016)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(217747200))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218074944))))[name = string("layers_c2_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_c2_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(218075264))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223318208))))[name = string("layers_c2_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_c2_5_self_attn_q_norm_weight = const()[name = string("layers_c2_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223322368)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_c2_5_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(223323456))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224634240))))[name = string("layers_c2_5_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_c2_5_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224635328))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225946112))))[name = string("layers_c2_5_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_c2_5_self_attn_k_norm_weight = const()[name = string("layers_c2_5_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225947200)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225948288))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239055552))))[name = string("layers_c2_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239065856))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252173120))))[name = string("layers_c2_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(252183424))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265290688))))[name = string("layers_c2_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_5_post_feedforward_layernorm_weight = const()[name = string("layers_c2_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265293312)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265298496))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265626240))))[name = string("layers_c2_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265626560))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268248064))))[name = string("layers_c2_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_6_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268250176))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268905600))))[name = string("layers_c2_6_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_6_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(268906176))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269561600))))[name = string("layers_c2_6_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_6_self_attn_k_norm_weight = const()[name = string("layers_c2_6_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269562176)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(269562752))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282670016))))[name = string("layers_c2_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282680320))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295787584))))[name = string("layers_c2_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295797888))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308905152))))[name = string("layers_c2_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_6_post_feedforward_layernorm_weight = const()[name = string("layers_c2_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308907776)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(308912960))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309240704))))[name = string("layers_c2_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309241024))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311862528))))[name = string("layers_c2_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_7_self_attn_q_norm_weight = const()[name = string("layers_c2_7_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311864640)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_7_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(311865216))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(312520640))))[name = string("layers_c2_7_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_7_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(312521216))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313176640))))[name = string("layers_c2_7_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_7_self_attn_k_norm_weight = const()[name = string("layers_c2_7_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313177216)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(313177792))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326285056))))[name = string("layers_c2_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(326295360))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339402624))))[name = string("layers_c2_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(339412928))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352520192))))[name = string("layers_c2_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_7_post_feedforward_layernorm_weight = const()[name = string("layers_c2_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352522816)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352528000))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352855744))))[name = string("layers_c2_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(352856064))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355477568))))[name = string("layers_c2_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_8_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(355479680))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356135104))))[name = string("layers_c2_8_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_8_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356135680))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356791104))))[name = string("layers_c2_8_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_8_self_attn_k_norm_weight = const()[name = string("layers_c2_8_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356791680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356792256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369899520))))[name = string("layers_c2_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369909824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383017088))))[name = string("layers_c2_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383027392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396134656))))[name = string("layers_c2_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_8_post_feedforward_layernorm_weight = const()[name = string("layers_c2_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396137280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396142464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396470208))))[name = string("layers_c2_8_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_9_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(396470528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399092032))))[name = string("layers_c2_9_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_9_self_attn_q_norm_weight = const()[name = string("layers_c2_9_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399094144)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_9_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399094720))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399750144))))[name = string("layers_c2_9_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_9_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(399750720))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400406144))))[name = string("layers_c2_9_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_9_self_attn_k_norm_weight = const()[name = string("layers_c2_9_self_attn_k_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400406720)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_9_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400407296))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413514560))))[name = string("layers_c2_9_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_9_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(413524864))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(426632128))))[name = string("layers_c2_9_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_9_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(426642432))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439749696))))[name = string("layers_c2_9_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_9_post_feedforward_layernorm_weight = const()[name = string("layers_c2_9_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439752320)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_9_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(439757504))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440085248))))[name = string("layers_c2_9_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c2_10_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(440085568))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442707072))))[name = string("layers_c2_10_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_c2_10_self_attn_q_norm_weight = const()[name = string("layers_c2_10_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442709184)))];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_10_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(442709760))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443365184))))[name = string("layers_c2_10_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [512, 2560, 1, 1]> layers_c2_10_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [512, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(443365760))), lut = tensor<fp16, [16, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(444021184))))[name = string("layers_c2_10_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_10_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(444021760))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457129024))))[name = string("layers_c2_10_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_10_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(457139328))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470246592))))[name = string("layers_c2_10_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_10_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(470256896))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483364160))))[name = string("layers_c2_10_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_10_post_feedforward_layernorm_weight = const()[name = string("layers_c2_10_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483366784)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_10_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483371968))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483699712))))[name = string("layers_c2_10_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_c2_11_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(483700032))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488942976))))[name = string("layers_c2_11_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_c2_11_self_attn_q_norm_weight = const()[name = string("layers_c2_11_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488947136)))];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_c2_11_self_attn_k_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(488948224))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490259008))))[name = string("layers_c2_11_self_attn_k_proj_weight_palettized")];
+            tensor<fp16, [1024, 2560, 1, 1]> layers_c2_11_self_attn_v_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [1024, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(490260096))), lut = tensor<fp16, [32, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491570880))))[name = string("layers_c2_11_self_attn_v_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_c2_11_self_attn_k_norm_weight = const()[name = string("layers_c2_11_self_attn_k_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491571968)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_11_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(491573056))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504680320))))[name = string("layers_c2_11_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c2_11_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(504690624))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(517797888))))[name = string("layers_c2_11_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c2_11_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(517808192))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530915456))))[name = string("layers_c2_11_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c2_11_post_feedforward_layernorm_weight = const()[name = string("layers_c2_11_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530918080)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c2_11_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(530923264))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531251008))))[name = string("layers_c2_11_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(531251328))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533872832))))[name = string("layers_c3_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(533874944))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546982208))))[name = string("layers_c3_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(546992512))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560099776))))[name = string("layers_c3_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(560110080))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(573217344))))[name = string("layers_c3_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_0_post_feedforward_layernorm_weight = const()[name = string("layers_c3_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(573219968)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(573225152))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(573552896))))[name = string("layers_c3_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(573553216))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(576174720))))[name = string("layers_c3_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(576176832))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(589284096))))[name = string("layers_c3_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(589294400))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(602401664))))[name = string("layers_c3_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(602411968))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(615519232))))[name = string("layers_c3_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_1_post_feedforward_layernorm_weight = const()[name = string("layers_c3_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(615521856)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(615527040))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(615854784))))[name = string("layers_c3_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(615855104))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(618476608))))[name = string("layers_c3_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(618478720))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(631585984))))[name = string("layers_c3_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(631596288))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(644703552))))[name = string("layers_c3_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(644713856))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(657821120))))[name = string("layers_c3_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_2_post_feedforward_layernorm_weight = const()[name = string("layers_c3_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(657823744)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(657828928))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(658156672))))[name = string("layers_c3_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(658156992))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(660778496))))[name = string("layers_c3_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(660780608))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(673887872))))[name = string("layers_c3_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(673898176))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(687005440))))[name = string("layers_c3_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(687015744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(700123008))))[name = string("layers_c3_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_3_post_feedforward_layernorm_weight = const()[name = string("layers_c3_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(700125632)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(700130816))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(700458560))))[name = string("layers_c3_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(700458880))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(703080384))))[name = string("layers_c3_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(703082496))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(716189760))))[name = string("layers_c3_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(716200064))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(729307328))))[name = string("layers_c3_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(729317632))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742424896))))[name = string("layers_c3_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_4_post_feedforward_layernorm_weight = const()[name = string("layers_c3_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742427520)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742432704))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742760448))))[name = string("layers_c3_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_c3_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(742760768))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(748003712))))[name = string("layers_c3_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(748007872))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(761115136))))[name = string("layers_c3_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(761125440))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(774232704))))[name = string("layers_c3_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(774243008))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(787350272))))[name = string("layers_c3_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_5_post_feedforward_layernorm_weight = const()[name = string("layers_c3_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(787352896)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(787358080))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(787685824))))[name = string("layers_c3_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(787686144))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(790307648))))[name = string("layers_c3_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(790309760))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(803417024))))[name = string("layers_c3_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(803427328))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(816534592))))[name = string("layers_c3_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(816544896))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(829652160))))[name = string("layers_c3_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_6_post_feedforward_layernorm_weight = const()[name = string("layers_c3_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(829654784)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(829659968))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(829987712))))[name = string("layers_c3_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(829988032))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(832609536))))[name = string("layers_c3_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(832611648))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(845718912))))[name = string("layers_c3_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(845729216))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(858836480))))[name = string("layers_c3_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(858846784))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(871954048))))[name = string("layers_c3_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_7_post_feedforward_layernorm_weight = const()[name = string("layers_c3_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(871956672)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(871961856))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(872289600))))[name = string("layers_c3_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_c3_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(872289920))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(874911424))))[name = string("layers_c3_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(874913536))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(888020800))))[name = string("layers_c3_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_c3_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(888031104))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(901138368))))[name = string("layers_c3_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_c3_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(901148672))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914255936))))[name = string("layers_c3_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_c3_8_post_feedforward_layernorm_weight = const()[name = string("layers_c3_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914258560)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_c3_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914263744))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914591488))))[name = string("layers_c3_8_per_layer_input_gate_weight_palettized")];
+            tensor<int32, [4]> var_1168_begin_0 = const()[name = string("op_1168_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1168_end_0 = const()[name = string("op_1168_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1168_end_mask_0 = const()[name = string("op_1168_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1168_squeeze_mask_0 = const()[name = string("op_1168_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1168_cast_fp16 = slice_by_index(begin = var_1168_begin_0, end = var_1168_end_0, end_mask = var_1168_end_mask_0, squeeze_mask = var_1168_squeeze_mask_0, x = K_sliding_in)[name = string("op_1168_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_1_axes_0 = const()[name = string("K_sliding_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_1_cast_fp16 = expand_dims(axes = K_sliding_slot_1_axes_0, x = var_1168_cast_fp16)[name = string("K_sliding_slot_1_cast_fp16")];
+            tensor<int32, [4]> var_1173_begin_0 = const()[name = string("op_1173_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_1173_end_0 = const()[name = string("op_1173_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1173_end_mask_0 = const()[name = string("op_1173_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1173_squeeze_mask_0 = const()[name = string("op_1173_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1173_cast_fp16 = slice_by_index(begin = var_1173_begin_0, end = var_1173_end_0, end_mask = var_1173_end_mask_0, squeeze_mask = var_1173_squeeze_mask_0, x = V_sliding_in)[name = string("op_1173_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_1_axes_0 = const()[name = string("V_sliding_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_1_cast_fp16 = expand_dims(axes = V_sliding_slot_1_axes_0, x = var_1173_cast_fp16)[name = string("V_sliding_slot_1_cast_fp16")];
+            int32 var_1180 = const()[name = string("op_1180"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1182_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_1182_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_1_cast_fp16 = concat(axis = var_1180, interleave = input_1_interleave_0, values = (hidden_states, var_1182_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1177_to_fp16 = const()[name = string("op_1177_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_1177_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_1187_split_sizes_0 = const()[name = string("op_1187_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1187_axis_0 = const()[name = string("op_1187_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1187_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1187_cast_fp16_1 = split(axis = var_1187_axis_0, split_sizes = var_1187_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_1187_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914591808)))];
+            tensor<fp16, [1, 1, 2560]> h_1_cast_fp16 = mul(x = var_1187_cast_fp16_0, y = layers_c2_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_1193 = const()[name = string("op_1193"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1196_axes_0 = const()[name = string("op_1196_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1194_cast_fp16 = transpose(perm = var_1193, x = h_1_cast_fp16)[name = string("transpose_305")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1196_cast_fp16 = expand_dims(axes = var_1196_axes_0, x = var_1194_cast_fp16)[name = string("op_1196_cast_fp16")];
+            string var_1212_pad_type_0 = const()[name = string("op_1212_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1212_strides_0 = const()[name = string("op_1212_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1212_pad_0 = const()[name = string("op_1212_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1212_dilations_0 = const()[name = string("op_1212_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1212_groups_0 = const()[name = string("op_1212_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1212 = conv(dilations = var_1212_dilations_0, groups = var_1212_groups_0, pad = var_1212_pad_0, pad_type = var_1212_pad_type_0, strides = var_1212_strides_0, weight = layers_c2_0_self_attn_q_proj_weight_palettized, x = var_1196_cast_fp16)[name = string("op_1212")];
+            tensor<int32, [4]> var_1217 = const()[name = string("op_1217"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1218 = reshape(shape = var_1217, x = var_1212)[name = string("op_1218")];
+            tensor<int32, [4]> var_1223 = const()[name = string("op_1223"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1233 = const()[name = string("op_1233"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1224 = transpose(perm = var_1223, x = var_1218)[name = string("transpose_304")];
+            tensor<fp16, [1, 8, 256]> x_1 = reshape(shape = var_1233, x = var_1224)[name = string("x_1")];
+            int32 var_1239 = const()[name = string("op_1239"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1241 = mul(x = x_1, y = const_1_promoted)[name = string("op_1241")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_5 = concat(axis = var_1239, interleave = input_5_interleave_0, values = (x_1, var_1241))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1236_to_fp16 = const()[name = string("op_1236_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_1236_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_1246_split_sizes_0 = const()[name = string("op_1246_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1246_axis_0 = const()[name = string("op_1246_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1246_0, tensor<fp16, [1, 8, 256]> var_1246_1 = split(axis = var_1246_axis_0, split_sizes = var_1246_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_1246")];
+            tensor<fp16, [1, 8, 256]> var_1248 = mul(x = var_1246_0, y = layers_c2_0_self_attn_q_norm_weight)[name = string("op_1248")];
+            tensor<int32, [4]> var_1253 = const()[name = string("op_1253"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_3 = reshape(shape = var_1253, x = var_1248)[name = string("q_3")];
+            tensor<fp16, [1, 8, 1, 256]> var_1255_cast_fp16 = mul(x = q_3, y = cos_s)[name = string("op_1255_cast_fp16")];
+            tensor<int32, [2]> var_1256_split_sizes_0 = const()[name = string("op_1256_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1256_axis_0 = const()[name = string("op_1256_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1256_0, tensor<fp16, [1, 8, 1, 128]> var_1256_1 = split(axis = var_1256_axis_0, split_sizes = var_1256_split_sizes_0, x = q_3)[name = string("op_1256")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1258 = mul(x = var_1256_1, y = const_2_promoted)[name = string("op_1258")];
+            int32 var_1260 = const()[name = string("op_1260"), val = int32(-1)];
+            bool var_1261_interleave_0 = const()[name = string("op_1261_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1261 = concat(axis = var_1260, interleave = var_1261_interleave_0, values = (var_1258, var_1256_0))[name = string("op_1261")];
+            tensor<fp16, [1, 8, 1, 256]> var_1262_cast_fp16 = mul(x = var_1261, y = sin_s)[name = string("op_1262_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_7_cast_fp16 = add(x = var_1255_cast_fp16, y = var_1262_cast_fp16)[name = string("q_7_cast_fp16")];
+            string var_1275_pad_type_0 = const()[name = string("op_1275_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1275_strides_0 = const()[name = string("op_1275_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1275_pad_0 = const()[name = string("op_1275_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1275_dilations_0 = const()[name = string("op_1275_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1275_groups_0 = const()[name = string("op_1275_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1275 = conv(dilations = var_1275_dilations_0, groups = var_1275_groups_0, pad = var_1275_pad_0, pad_type = var_1275_pad_type_0, strides = var_1275_strides_0, weight = layers_c2_0_self_attn_k_proj_weight_palettized, x = var_1196_cast_fp16)[name = string("op_1275")];
+            tensor<int32, [4]> var_1280 = const()[name = string("op_1280"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1281 = reshape(shape = var_1280, x = var_1275)[name = string("op_1281")];
+            tensor<int32, [4]> var_1286 = const()[name = string("op_1286"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_1303_pad_type_0 = const()[name = string("op_1303_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1303_strides_0 = const()[name = string("op_1303_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1303_pad_0 = const()[name = string("op_1303_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1303_dilations_0 = const()[name = string("op_1303_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1303_groups_0 = const()[name = string("op_1303_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1303 = conv(dilations = var_1303_dilations_0, groups = var_1303_groups_0, pad = var_1303_pad_0, pad_type = var_1303_pad_type_0, strides = var_1303_strides_0, weight = layers_c2_0_self_attn_v_proj_weight_palettized, x = var_1196_cast_fp16)[name = string("op_1303")];
+            tensor<int32, [4]> var_1308 = const()[name = string("op_1308"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1309 = reshape(shape = var_1308, x = var_1303)[name = string("op_1309")];
+            tensor<int32, [4]> var_1314 = const()[name = string("op_1314"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1324 = const()[name = string("op_1324"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_1287 = transpose(perm = var_1286, x = var_1281)[name = string("transpose_303")];
+            tensor<fp16, [1, 2, 256]> x_3 = reshape(shape = var_1324, x = var_1287)[name = string("x_3")];
+            int32 var_1330 = const()[name = string("op_1330"), val = int32(-1)];
+            fp16 const_3_promoted = const()[name = string("const_3_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_1332 = mul(x = x_3, y = const_3_promoted)[name = string("op_1332")];
+            bool input_7_interleave_0 = const()[name = string("input_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_7 = concat(axis = var_1330, interleave = input_7_interleave_0, values = (x_3, var_1332))[name = string("input_7")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1327_to_fp16 = const()[name = string("op_1327_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_1327_to_fp16, x = input_7)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_1337_split_sizes_0 = const()[name = string("op_1337_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1337_axis_0 = const()[name = string("op_1337_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_1337_0, tensor<fp16, [1, 2, 256]> var_1337_1 = split(axis = var_1337_axis_0, split_sizes = var_1337_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_1337")];
+            tensor<fp16, [1, 2, 256]> var_1339 = mul(x = var_1337_0, y = layers_c2_0_self_attn_k_norm_weight)[name = string("op_1339")];
+            tensor<int32, [4]> var_1344 = const()[name = string("op_1344"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_5 = reshape(shape = var_1344, x = var_1339)[name = string("q_5")];
+            fp16 var_1346_promoted = const()[name = string("op_1346_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_1315 = transpose(perm = var_1314, x = var_1309)[name = string("transpose_302")];
+            tensor<fp16, [1, 2, 1, 256]> var_1347 = pow(x = var_1315, y = var_1346_promoted)[name = string("op_1347")];
+            tensor<int32, [1]> var_1352_axes_0 = const()[name = string("op_1352_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1352_keep_dims_0 = const()[name = string("op_1352_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_1352 = reduce_mean(axes = var_1352_axes_0, keep_dims = var_1352_keep_dims_0, x = var_1347)[name = string("op_1352")];
+            fp16 var_1354_to_fp16 = const()[name = string("op_1354_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_1_cast_fp16 = add(x = var_1352, y = var_1354_to_fp16)[name = string("mean_sq_1_cast_fp16")];
+            fp32 var_1356_epsilon_0 = const()[name = string("op_1356_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_1356_cast_fp16 = rsqrt(epsilon = var_1356_epsilon_0, x = mean_sq_1_cast_fp16)[name = string("op_1356_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_11_cast_fp16 = mul(x = var_1315, y = var_1356_cast_fp16)[name = string("input_11_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_1358_cast_fp16 = mul(x = q_5, y = cos_s)[name = string("op_1358_cast_fp16")];
+            tensor<int32, [2]> var_1359_split_sizes_0 = const()[name = string("op_1359_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1359_axis_0 = const()[name = string("op_1359_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_1359_0, tensor<fp16, [1, 2, 1, 128]> var_1359_1 = split(axis = var_1359_axis_0, split_sizes = var_1359_split_sizes_0, x = q_5)[name = string("op_1359")];
+            fp16 const_4_promoted = const()[name = string("const_4_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_1361 = mul(x = var_1359_1, y = const_4_promoted)[name = string("op_1361")];
+            int32 var_1363 = const()[name = string("op_1363"), val = int32(-1)];
+            bool var_1364_interleave_0 = const()[name = string("op_1364_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_1364 = concat(axis = var_1363, interleave = var_1364_interleave_0, values = (var_1361, var_1359_0))[name = string("op_1364")];
+            tensor<fp16, [1, 2, 1, 256]> var_1365_cast_fp16 = mul(x = var_1364, y = sin_s)[name = string("op_1365_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_9_cast_fp16 = add(x = var_1358_cast_fp16, y = var_1365_cast_fp16)[name = string("input_9_cast_fp16")];
+            tensor<int32, [8]> k_padded_1_pad_0 = const()[name = string("k_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_1_mode_0 = const()[name = string("k_padded_1_mode_0"), val = string("constant")];
+            fp16 const_5_to_fp16 = const()[name = string("const_5_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_1_cast_fp16 = pad(constant_val = const_5_to_fp16, mode = k_padded_1_mode_0, pad = k_padded_1_pad_0, x = input_9_cast_fp16)[name = string("k_padded_1_cast_fp16")];
+            tensor<int32, [8]> v_padded_1_pad_0 = const()[name = string("v_padded_1_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_1_mode_0 = const()[name = string("v_padded_1_mode_0"), val = string("constant")];
+            fp16 const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_1_cast_fp16 = pad(constant_val = const_6_to_fp16, mode = v_padded_1_mode_0, pad = v_padded_1_pad_0, x = input_11_cast_fp16)[name = string("v_padded_1_cast_fp16")];
+            tensor<int32, [4]> var_1394_begin_0 = const()[name = string("op_1394_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1394_end_0 = const()[name = string("op_1394_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1394_end_mask_0 = const()[name = string("op_1394_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1394_cast_fp16 = slice_by_index(begin = var_1394_begin_0, end = var_1394_end_0, end_mask = var_1394_end_mask_0, x = K_sliding_slot_1_cast_fp16)[name = string("op_1394_cast_fp16")];
+            int32 var_1401 = const()[name = string("op_1401"), val = int32(2)];
+            bool K_sliding_out_1_interleave_0 = const()[name = string("K_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_1_cast_fp16 = concat(axis = var_1401, interleave = K_sliding_out_1_interleave_0, values = (var_1394_cast_fp16, k_padded_1_cast_fp16))[name = string("K_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> var_1417_begin_0 = const()[name = string("op_1417_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1417_end_0 = const()[name = string("op_1417_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1417_end_mask_0 = const()[name = string("op_1417_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1417_cast_fp16 = slice_by_index(begin = var_1417_begin_0, end = var_1417_end_0, end_mask = var_1417_end_mask_0, x = V_sliding_slot_1_cast_fp16)[name = string("op_1417_cast_fp16")];
+            int32 var_1424 = const()[name = string("op_1424"), val = int32(2)];
+            bool V_sliding_out_1_interleave_0 = const()[name = string("V_sliding_out_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_1_cast_fp16 = concat(axis = var_1424, interleave = V_sliding_out_1_interleave_0, values = (var_1417_cast_fp16, v_padded_1_cast_fp16))[name = string("V_sliding_out_1_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_1_begin_0 = const()[name = string("K_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_1_end_0 = const()[name = string("K_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_1_end_mask_0 = const()[name = string("K_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_1_cast_fp16 = slice_by_index(begin = K_for_attn_1_begin_0, end = K_for_attn_1_end_0, end_mask = K_for_attn_1_end_mask_0, x = K_sliding_out_1_cast_fp16)[name = string("K_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_1_begin_0 = const()[name = string("V_for_attn_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_1_end_0 = const()[name = string("V_for_attn_1_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_1_end_mask_0 = const()[name = string("V_for_attn_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_1_cast_fp16 = slice_by_index(begin = V_for_attn_1_begin_0, end = V_for_attn_1_end_0, end_mask = V_for_attn_1_end_mask_0, x = V_sliding_out_1_cast_fp16)[name = string("V_for_attn_1_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = K_for_attn_1_cast_fp16)[name = string("transpose_301")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_300")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_84_perm_0 = const()[name = string("transpose_84_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = V_for_attn_1_cast_fp16)[name = string("transpose_299")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_298")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_84_cast_fp16 = transpose(perm = transpose_84_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_297")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_7_cast_fp16, y = transpose_84_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_7_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_7_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_7_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 1, 512]> var_1465 = sub(x = x_7_cast_fp16, y = reduce_max_0)[name = string("op_1465")];
+            tensor<fp16, [1, 8, 1, 512]> var_1471 = exp(x = var_1465)[name = string("op_1471")];
+            tensor<int32, [1]> var_1481_axes_0 = const()[name = string("op_1481_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1481_keep_dims_0 = const()[name = string("op_1481_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1481 = reduce_sum(axes = var_1481_axes_0, keep_dims = var_1481_keep_dims_0, x = var_1471)[name = string("op_1481")];
+            tensor<fp16, [1, 8, 1, 512]> var_1487_cast_fp16 = real_div(x = var_1471, y = var_1481)[name = string("op_1487_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_296")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_1487_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_1498 = const()[name = string("op_1498"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1505 = const()[name = string("op_1505"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1499_cast_fp16 = transpose(perm = var_1498, x = attn_output_1_cast_fp16)[name = string("transpose_295")];
+            tensor<fp16, [1, 1, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_1505, x = var_1499_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_1510 = const()[name = string("op_1510"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1526_pad_type_0 = const()[name = string("op_1526_pad_type_0"), val = string("valid")];
+            int32 var_1526_groups_0 = const()[name = string("op_1526_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1526_strides_0 = const()[name = string("op_1526_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1526_pad_0 = const()[name = string("op_1526_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1526_dilations_0 = const()[name = string("op_1526_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(914596992))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917218496))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1511_cast_fp16 = transpose(perm = var_1510, x = attn_output_3_cast_fp16)[name = string("transpose_294")];
+            tensor<fp16, [1, 2560, 1]> var_1526_cast_fp16 = conv(dilations = var_1526_dilations_0, groups = var_1526_groups_0, pad = var_1526_pad_0, pad_type = var_1526_pad_type_0, strides = var_1526_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_1511_cast_fp16)[name = string("op_1526_cast_fp16")];
+            tensor<int32, [3]> var_1530 = const()[name = string("op_1530"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1536 = const()[name = string("op_1536"), val = int32(-1)];
+            fp16 const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_11_cast_fp16 = transpose(perm = var_1530, x = var_1526_cast_fp16)[name = string("transpose_293")];
+            tensor<fp16, [1, 1, 2560]> var_1538_cast_fp16 = mul(x = x_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("op_1538_cast_fp16")];
+            bool input_15_interleave_0 = const()[name = string("input_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_15_cast_fp16 = concat(axis = var_1536, interleave = input_15_interleave_0, values = (x_11_cast_fp16, var_1538_cast_fp16))[name = string("input_15_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1533_to_fp16 = const()[name = string("op_1533_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_1533_to_fp16, x = input_15_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_1543_split_sizes_0 = const()[name = string("op_1543_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1543_axis_0 = const()[name = string("op_1543_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1543_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1543_cast_fp16_1 = split(axis = var_1543_axis_0, split_sizes = var_1543_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_1543_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917221120)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_5_cast_fp16 = mul(x = var_1543_cast_fp16_0, y = layers_c2_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_13_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_13_cast_fp16")];
+            int32 var_1552 = const()[name = string("op_1552"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1554_cast_fp16 = mul(x = x_13_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_1554_cast_fp16")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_17_cast_fp16 = concat(axis = var_1552, interleave = input_17_interleave_0, values = (x_13_cast_fp16, var_1554_cast_fp16))[name = string("input_17_cast_fp16")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1549_to_fp16 = const()[name = string("op_1549_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_1549_to_fp16, x = input_17_cast_fp16)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_1559_split_sizes_0 = const()[name = string("op_1559_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1559_axis_0 = const()[name = string("op_1559_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1559_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1559_cast_fp16_1 = split(axis = var_1559_axis_0, split_sizes = var_1559_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_1559_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917226304)))];
+            tensor<fp16, [1, 1, 2560]> h_3_cast_fp16 = mul(x = var_1559_cast_fp16_0, y = layers_c2_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_1570 = const()[name = string("op_1570"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1571 = transpose(perm = var_1570, x = h_3_cast_fp16)[name = string("transpose_292")];
+            tensor<fp16, [1, 2560, 1, 1]> input_19 = expand_dims(axes = input_19_axes_0, x = var_1571)[name = string("input_19")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_c2_0_mlp_gate_proj_weight_palettized, x = input_19)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_c2_0_mlp_up_proj_weight_palettized, x = input_19)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 1]> input_21 = mul(x = gate_3, y = up_1)[name = string("input_21")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_c2_0_mlp_down_proj_weight_palettized, x = input_21)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_1611_axes_0 = const()[name = string("op_1611_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1611 = squeeze(axes = var_1611_axes_0, x = mlp_out_1)[name = string("op_1611")];
+            tensor<int32, [3]> var_1615 = const()[name = string("op_1615"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1621 = const()[name = string("op_1621"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_15 = transpose(perm = var_1615, x = var_1611)[name = string("transpose_291")];
+            tensor<fp16, [1, 1, 2560]> var_1623 = mul(x = x_15, y = const_9_promoted)[name = string("op_1623")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_23 = concat(axis = var_1621, interleave = input_23_interleave_0, values = (x_15, var_1623))[name = string("input_23")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1618_to_fp16 = const()[name = string("op_1618_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_1618_to_fp16, x = input_23)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_1628_split_sizes_0 = const()[name = string("op_1628_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1628_axis_0 = const()[name = string("op_1628_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1628_0, tensor<fp16, [1, 1, 2560]> var_1628_1 = split(axis = var_1628_axis_0, split_sizes = var_1628_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_1628")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_3 = mul(x = var_1628_0, y = layers_c2_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_5_cast_fp16 = add(x = x_13_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 3072])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 1, 3328])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_1656 = const()[name = string("op_1656"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_25_axes_0 = const()[name = string("input_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1657 = transpose(perm = var_1656, x = hidden_states_5_cast_fp16)[name = string("transpose_290")];
+            tensor<fp16, [1, 2560, 1, 1]> input_25 = expand_dims(axes = input_25_axes_0, x = var_1657)[name = string("input_25")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_c2_0_per_layer_input_gate_weight_palettized, x = input_25)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_1676 = const()[name = string("op_1676"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1677_cast_fp16 = transpose(perm = var_1676, x = per_layer_slice_1_cast_fp16)[name = string("transpose_289")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_1677_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_27_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_27_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917231488))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917559232))))[name = string("layers_c2_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_c2_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_27_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_1693_axes_0 = const()[name = string("op_1693_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1693_cast_fp16 = squeeze(axes = var_1693_axes_0, x = gated_5_cast_fp16)[name = string("op_1693_cast_fp16")];
+            tensor<int32, [3]> var_1697 = const()[name = string("op_1697"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1703 = const()[name = string("op_1703"), val = int32(-1)];
+            fp16 const_10_promoted_to_fp16 = const()[name = string("const_10_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_17_cast_fp16 = transpose(perm = var_1697, x = var_1693_cast_fp16)[name = string("transpose_288")];
+            tensor<fp16, [1, 1, 2560]> var_1705_cast_fp16 = mul(x = x_17_cast_fp16, y = const_10_promoted_to_fp16)[name = string("op_1705_cast_fp16")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_29_cast_fp16 = concat(axis = var_1703, interleave = input_29_interleave_0, values = (x_17_cast_fp16, var_1705_cast_fp16))[name = string("input_29_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1700_to_fp16 = const()[name = string("op_1700_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_1700_to_fp16, x = input_29_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_1710_split_sizes_0 = const()[name = string("op_1710_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1710_axis_0 = const()[name = string("op_1710_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1710_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1710_cast_fp16_1 = split(axis = var_1710_axis_0, split_sizes = var_1710_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_1710_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917561856)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_9_cast_fp16 = mul(x = var_1710_cast_fp16_0, y = layers_c2_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.7ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_19_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_11_promoted_to_fp16)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> var_1722_axes_0 = const()[name = string("op_1722_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1722_cast_fp16 = squeeze(axes = var_1722_axes_0, x = K_sliding_out_1_cast_fp16)[name = string("op_1722_cast_fp16")];
+            tensor<int32, [1]> var_1724_axes_0 = const()[name = string("op_1724_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_1724_cast_fp16 = squeeze(axes = var_1724_axes_0, x = V_sliding_out_1_cast_fp16)[name = string("op_1724_cast_fp16")];
+            tensor<int32, [4]> var_1727_begin_0 = const()[name = string("op_1727_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1727_end_0 = const()[name = string("op_1727_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1727_end_mask_0 = const()[name = string("op_1727_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1727_squeeze_mask_0 = const()[name = string("op_1727_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1727_cast_fp16 = slice_by_index(begin = var_1727_begin_0, end = var_1727_end_0, end_mask = var_1727_end_mask_0, squeeze_mask = var_1727_squeeze_mask_0, x = K_sliding_in)[name = string("op_1727_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_3_axes_0 = const()[name = string("K_sliding_slot_3_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_3_cast_fp16 = expand_dims(axes = K_sliding_slot_3_axes_0, x = var_1727_cast_fp16)[name = string("K_sliding_slot_3_cast_fp16")];
+            tensor<int32, [4]> var_1732_begin_0 = const()[name = string("op_1732_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_1732_end_0 = const()[name = string("op_1732_end_0"), val = tensor<int32, [4]>([2, 2, 512, 512])];
+            tensor<bool, [4]> var_1732_end_mask_0 = const()[name = string("op_1732_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_1732_squeeze_mask_0 = const()[name = string("op_1732_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_1732_cast_fp16 = slice_by_index(begin = var_1732_begin_0, end = var_1732_end_0, end_mask = var_1732_end_mask_0, squeeze_mask = var_1732_squeeze_mask_0, x = V_sliding_in)[name = string("op_1732_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_3_axes_0 = const()[name = string("V_sliding_slot_3_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_3_cast_fp16 = expand_dims(axes = V_sliding_slot_3_axes_0, x = var_1732_cast_fp16)[name = string("V_sliding_slot_3_cast_fp16")];
+            int32 var_1739 = const()[name = string("op_1739"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1741_cast_fp16 = mul(x = x_19_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1741_cast_fp16")];
+            bool input_31_interleave_0 = const()[name = string("input_31_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_31_cast_fp16 = concat(axis = var_1739, interleave = input_31_interleave_0, values = (x_19_cast_fp16, var_1741_cast_fp16))[name = string("input_31_cast_fp16")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1736_to_fp16 = const()[name = string("op_1736_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_1736_to_fp16, x = input_31_cast_fp16)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_1746_split_sizes_0 = const()[name = string("op_1746_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1746_axis_0 = const()[name = string("op_1746_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1746_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1746_cast_fp16_1 = split(axis = var_1746_axis_0, split_sizes = var_1746_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_1746_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917567040)))];
+            tensor<fp16, [1, 1, 2560]> h_7_cast_fp16 = mul(x = var_1746_cast_fp16_0, y = layers_c2_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_1752 = const()[name = string("op_1752"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1755_axes_0 = const()[name = string("op_1755_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1753_cast_fp16 = transpose(perm = var_1752, x = h_7_cast_fp16)[name = string("transpose_287")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1755_cast_fp16 = expand_dims(axes = var_1755_axes_0, x = var_1753_cast_fp16)[name = string("op_1755_cast_fp16")];
+            string var_1771_pad_type_0 = const()[name = string("op_1771_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1771_strides_0 = const()[name = string("op_1771_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1771_pad_0 = const()[name = string("op_1771_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1771_dilations_0 = const()[name = string("op_1771_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1771_groups_0 = const()[name = string("op_1771_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1771 = conv(dilations = var_1771_dilations_0, groups = var_1771_groups_0, pad = var_1771_pad_0, pad_type = var_1771_pad_type_0, strides = var_1771_strides_0, weight = layers_c2_1_self_attn_q_proj_weight_palettized, x = var_1755_cast_fp16)[name = string("op_1771")];
+            tensor<int32, [4]> var_1776 = const()[name = string("op_1776"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1777 = reshape(shape = var_1776, x = var_1771)[name = string("op_1777")];
+            tensor<int32, [4]> var_1782 = const()[name = string("op_1782"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1792 = const()[name = string("op_1792"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1783 = transpose(perm = var_1782, x = var_1777)[name = string("transpose_286")];
+            tensor<fp16, [1, 8, 256]> x_21 = reshape(shape = var_1792, x = var_1783)[name = string("x_21")];
+            int32 var_1798 = const()[name = string("op_1798"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1800 = mul(x = x_21, y = const_13_promoted)[name = string("op_1800")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_35 = concat(axis = var_1798, interleave = input_35_interleave_0, values = (x_21, var_1800))[name = string("input_35")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1795_to_fp16 = const()[name = string("op_1795_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1795_to_fp16, x = input_35)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1805_split_sizes_0 = const()[name = string("op_1805_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1805_axis_0 = const()[name = string("op_1805_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1805_0, tensor<fp16, [1, 8, 256]> var_1805_1 = split(axis = var_1805_axis_0, split_sizes = var_1805_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1805")];
+            tensor<fp16, [1, 8, 256]> var_1807 = mul(x = var_1805_0, y = layers_c2_1_self_attn_q_norm_weight)[name = string("op_1807")];
+            tensor<int32, [4]> var_1812 = const()[name = string("op_1812"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_11 = reshape(shape = var_1812, x = var_1807)[name = string("q_11")];
+            tensor<fp16, [1, 8, 1, 256]> var_1814_cast_fp16 = mul(x = q_11, y = cos_s)[name = string("op_1814_cast_fp16")];
+            tensor<int32, [2]> var_1815_split_sizes_0 = const()[name = string("op_1815_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1815_axis_0 = const()[name = string("op_1815_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1815_0, tensor<fp16, [1, 8, 1, 128]> var_1815_1 = split(axis = var_1815_axis_0, split_sizes = var_1815_split_sizes_0, x = q_11)[name = string("op_1815")];
+            fp16 const_14_promoted = const()[name = string("const_14_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1817 = mul(x = var_1815_1, y = const_14_promoted)[name = string("op_1817")];
+            int32 var_1819 = const()[name = string("op_1819"), val = int32(-1)];
+            bool var_1820_interleave_0 = const()[name = string("op_1820_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1820 = concat(axis = var_1819, interleave = var_1820_interleave_0, values = (var_1817, var_1815_0))[name = string("op_1820")];
+            tensor<fp16, [1, 8, 1, 256]> var_1821_cast_fp16 = mul(x = var_1820, y = sin_s)[name = string("op_1821_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_15_cast_fp16 = add(x = var_1814_cast_fp16, y = var_1821_cast_fp16)[name = string("q_15_cast_fp16")];
+            string var_1834_pad_type_0 = const()[name = string("op_1834_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1834_strides_0 = const()[name = string("op_1834_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1834_pad_0 = const()[name = string("op_1834_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1834_dilations_0 = const()[name = string("op_1834_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1834_groups_0 = const()[name = string("op_1834_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1834 = conv(dilations = var_1834_dilations_0, groups = var_1834_groups_0, pad = var_1834_pad_0, pad_type = var_1834_pad_type_0, strides = var_1834_strides_0, weight = layers_c2_1_self_attn_k_proj_weight_palettized, x = var_1755_cast_fp16)[name = string("op_1834")];
+            tensor<int32, [4]> var_1839 = const()[name = string("op_1839"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1840 = reshape(shape = var_1839, x = var_1834)[name = string("op_1840")];
+            tensor<int32, [4]> var_1845 = const()[name = string("op_1845"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_1862_pad_type_0 = const()[name = string("op_1862_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1862_strides_0 = const()[name = string("op_1862_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1862_pad_0 = const()[name = string("op_1862_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1862_dilations_0 = const()[name = string("op_1862_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1862_groups_0 = const()[name = string("op_1862_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_1862 = conv(dilations = var_1862_dilations_0, groups = var_1862_groups_0, pad = var_1862_pad_0, pad_type = var_1862_pad_type_0, strides = var_1862_strides_0, weight = layers_c2_1_self_attn_v_proj_weight_palettized, x = var_1755_cast_fp16)[name = string("op_1862")];
+            tensor<int32, [4]> var_1867 = const()[name = string("op_1867"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_1868 = reshape(shape = var_1867, x = var_1862)[name = string("op_1868")];
+            tensor<int32, [4]> var_1873 = const()[name = string("op_1873"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1883 = const()[name = string("op_1883"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_1846 = transpose(perm = var_1845, x = var_1840)[name = string("transpose_285")];
+            tensor<fp16, [1, 2, 256]> x_23 = reshape(shape = var_1883, x = var_1846)[name = string("x_23")];
+            int32 var_1889 = const()[name = string("op_1889"), val = int32(-1)];
+            fp16 const_15_promoted = const()[name = string("const_15_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_1891 = mul(x = x_23, y = const_15_promoted)[name = string("op_1891")];
+            bool input_37_interleave_0 = const()[name = string("input_37_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_37 = concat(axis = var_1889, interleave = input_37_interleave_0, values = (x_23, var_1891))[name = string("input_37")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1886_to_fp16 = const()[name = string("op_1886_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1886_to_fp16, x = input_37)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1896_split_sizes_0 = const()[name = string("op_1896_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1896_axis_0 = const()[name = string("op_1896_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_1896_0, tensor<fp16, [1, 2, 256]> var_1896_1 = split(axis = var_1896_axis_0, split_sizes = var_1896_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1896")];
+            tensor<fp16, [1, 2, 256]> var_1898 = mul(x = var_1896_0, y = layers_c2_1_self_attn_k_norm_weight)[name = string("op_1898")];
+            tensor<int32, [4]> var_1903 = const()[name = string("op_1903"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_13 = reshape(shape = var_1903, x = var_1898)[name = string("q_13")];
+            fp16 var_1905_promoted = const()[name = string("op_1905_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_1874 = transpose(perm = var_1873, x = var_1868)[name = string("transpose_284")];
+            tensor<fp16, [1, 2, 1, 256]> var_1906 = pow(x = var_1874, y = var_1905_promoted)[name = string("op_1906")];
+            tensor<int32, [1]> var_1911_axes_0 = const()[name = string("op_1911_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1911_keep_dims_0 = const()[name = string("op_1911_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_1911 = reduce_mean(axes = var_1911_axes_0, keep_dims = var_1911_keep_dims_0, x = var_1906)[name = string("op_1911")];
+            fp16 var_1913_to_fp16 = const()[name = string("op_1913_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_3_cast_fp16 = add(x = var_1911, y = var_1913_to_fp16)[name = string("mean_sq_3_cast_fp16")];
+            fp32 var_1915_epsilon_0 = const()[name = string("op_1915_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_1915_cast_fp16 = rsqrt(epsilon = var_1915_epsilon_0, x = mean_sq_3_cast_fp16)[name = string("op_1915_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_41_cast_fp16 = mul(x = var_1874, y = var_1915_cast_fp16)[name = string("input_41_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_1917_cast_fp16 = mul(x = q_13, y = cos_s)[name = string("op_1917_cast_fp16")];
+            tensor<int32, [2]> var_1918_split_sizes_0 = const()[name = string("op_1918_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1918_axis_0 = const()[name = string("op_1918_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_1918_0, tensor<fp16, [1, 2, 1, 128]> var_1918_1 = split(axis = var_1918_axis_0, split_sizes = var_1918_split_sizes_0, x = q_13)[name = string("op_1918")];
+            fp16 const_16_promoted = const()[name = string("const_16_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_1920 = mul(x = var_1918_1, y = const_16_promoted)[name = string("op_1920")];
+            int32 var_1922 = const()[name = string("op_1922"), val = int32(-1)];
+            bool var_1923_interleave_0 = const()[name = string("op_1923_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_1923 = concat(axis = var_1922, interleave = var_1923_interleave_0, values = (var_1920, var_1918_0))[name = string("op_1923")];
+            tensor<fp16, [1, 2, 1, 256]> var_1924_cast_fp16 = mul(x = var_1923, y = sin_s)[name = string("op_1924_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_39_cast_fp16 = add(x = var_1917_cast_fp16, y = var_1924_cast_fp16)[name = string("input_39_cast_fp16")];
+            tensor<int32, [8]> k_padded_3_pad_0 = const()[name = string("k_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_3_mode_0 = const()[name = string("k_padded_3_mode_0"), val = string("constant")];
+            fp16 const_17_to_fp16 = const()[name = string("const_17_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_3_cast_fp16 = pad(constant_val = const_17_to_fp16, mode = k_padded_3_mode_0, pad = k_padded_3_pad_0, x = input_39_cast_fp16)[name = string("k_padded_3_cast_fp16")];
+            tensor<int32, [8]> v_padded_3_pad_0 = const()[name = string("v_padded_3_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_3_mode_0 = const()[name = string("v_padded_3_mode_0"), val = string("constant")];
+            fp16 const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_3_cast_fp16 = pad(constant_val = const_18_to_fp16, mode = v_padded_3_mode_0, pad = v_padded_3_pad_0, x = input_41_cast_fp16)[name = string("v_padded_3_cast_fp16")];
+            tensor<int32, [4]> var_1953_begin_0 = const()[name = string("op_1953_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1953_end_0 = const()[name = string("op_1953_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1953_end_mask_0 = const()[name = string("op_1953_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1953_cast_fp16 = slice_by_index(begin = var_1953_begin_0, end = var_1953_end_0, end_mask = var_1953_end_mask_0, x = K_sliding_slot_3_cast_fp16)[name = string("op_1953_cast_fp16")];
+            int32 var_1960 = const()[name = string("op_1960"), val = int32(2)];
+            bool K_sliding_out_3_interleave_0 = const()[name = string("K_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_3_cast_fp16 = concat(axis = var_1960, interleave = K_sliding_out_3_interleave_0, values = (var_1953_cast_fp16, k_padded_3_cast_fp16))[name = string("K_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> var_1976_begin_0 = const()[name = string("op_1976_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_1976_end_0 = const()[name = string("op_1976_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_1976_end_mask_0 = const()[name = string("op_1976_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_1976_cast_fp16 = slice_by_index(begin = var_1976_begin_0, end = var_1976_end_0, end_mask = var_1976_end_mask_0, x = V_sliding_slot_3_cast_fp16)[name = string("op_1976_cast_fp16")];
+            int32 var_1983 = const()[name = string("op_1983"), val = int32(2)];
+            bool V_sliding_out_3_interleave_0 = const()[name = string("V_sliding_out_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_3_cast_fp16 = concat(axis = var_1983, interleave = V_sliding_out_3_interleave_0, values = (var_1976_cast_fp16, v_padded_3_cast_fp16))[name = string("V_sliding_out_3_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_3_begin_0 = const()[name = string("K_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_3_end_0 = const()[name = string("K_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_3_end_mask_0 = const()[name = string("K_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_3_cast_fp16 = slice_by_index(begin = K_for_attn_3_begin_0, end = K_for_attn_3_end_0, end_mask = K_for_attn_3_end_mask_0, x = K_sliding_out_3_cast_fp16)[name = string("K_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_3_begin_0 = const()[name = string("V_for_attn_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_3_end_0 = const()[name = string("V_for_attn_3_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_3_end_mask_0 = const()[name = string("V_for_attn_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_3_cast_fp16 = slice_by_index(begin = V_for_attn_3_begin_0, end = V_for_attn_3_end_0, end_mask = V_for_attn_3_end_mask_0, x = V_sliding_out_3_cast_fp16)[name = string("V_for_attn_3_cast_fp16")];
+            tensor<int32, [4]> transpose_4_perm_0 = const()[name = string("transpose_4_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_2_reps_0 = const()[name = string("tile_2_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_4_cast_fp16 = transpose(perm = transpose_4_perm_0, x = K_for_attn_3_cast_fp16)[name = string("transpose_283")];
+            tensor<fp16, [8, 1, 512, 256]> tile_2_cast_fp16 = tile(reps = tile_2_reps_0, x = transpose_4_cast_fp16)[name = string("tile_2_cast_fp16")];
+            tensor<int32, [5]> concat_4 = const()[name = string("concat_4"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_4_cast_fp16 = reshape(shape = concat_4, x = tile_2_cast_fp16)[name = string("reshape_4_cast_fp16")];
+            tensor<int32, [5]> transpose_5_perm_0 = const()[name = string("transpose_5_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_5 = const()[name = string("concat_5"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_5_cast_fp16 = transpose(perm = transpose_5_perm_0, x = reshape_4_cast_fp16)[name = string("transpose_282")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_5_cast_fp16 = reshape(shape = concat_5, x = transpose_5_cast_fp16)[name = string("reshape_5_cast_fp16")];
+            tensor<int32, [4]> transpose_85_perm_0 = const()[name = string("transpose_85_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_6_perm_0 = const()[name = string("transpose_6_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_3_reps_0 = const()[name = string("tile_3_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_6_cast_fp16 = transpose(perm = transpose_6_perm_0, x = V_for_attn_3_cast_fp16)[name = string("transpose_281")];
+            tensor<fp16, [8, 1, 512, 256]> tile_3_cast_fp16 = tile(reps = tile_3_reps_0, x = transpose_6_cast_fp16)[name = string("tile_3_cast_fp16")];
+            tensor<int32, [5]> concat_6 = const()[name = string("concat_6"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_6_cast_fp16 = reshape(shape = concat_6, x = tile_3_cast_fp16)[name = string("reshape_6_cast_fp16")];
+            tensor<int32, [5]> transpose_7_perm_0 = const()[name = string("transpose_7_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_7 = const()[name = string("concat_7"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_7_cast_fp16 = transpose(perm = transpose_7_perm_0, x = reshape_6_cast_fp16)[name = string("transpose_280")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_7_cast_fp16 = reshape(shape = concat_7, x = transpose_7_cast_fp16)[name = string("reshape_7_cast_fp16")];
+            tensor<int32, [4]> V_expanded_3_perm_0 = const()[name = string("V_expanded_3_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_85_cast_fp16 = transpose(perm = transpose_85_perm_0, x = reshape_5_cast_fp16)[name = string("transpose_279")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_15_cast_fp16, y = transpose_85_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_27_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_27_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_27_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 1, 512]> var_2024 = sub(x = x_27_cast_fp16, y = reduce_max_1)[name = string("op_2024")];
+            tensor<fp16, [1, 8, 1, 512]> var_2030 = exp(x = var_2024)[name = string("op_2030")];
+            tensor<int32, [1]> var_2040_axes_0 = const()[name = string("op_2040_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2040_keep_dims_0 = const()[name = string("op_2040_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2040 = reduce_sum(axes = var_2040_axes_0, keep_dims = var_2040_keep_dims_0, x = var_2030)[name = string("op_2040")];
+            tensor<fp16, [1, 8, 1, 512]> var_2046_cast_fp16 = real_div(x = var_2030, y = var_2040)[name = string("op_2046_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_3_cast_fp16 = transpose(perm = V_expanded_3_perm_0, x = reshape_7_cast_fp16)[name = string("transpose_278")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_2046_cast_fp16, y = V_expanded_3_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_2057 = const()[name = string("op_2057"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2064 = const()[name = string("op_2064"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2058_cast_fp16 = transpose(perm = var_2057, x = attn_output_7_cast_fp16)[name = string("transpose_277")];
+            tensor<fp16, [1, 1, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_2064, x = var_2058_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_2069 = const()[name = string("op_2069"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2085_pad_type_0 = const()[name = string("op_2085_pad_type_0"), val = string("valid")];
+            int32 var_2085_groups_0 = const()[name = string("op_2085_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2085_strides_0 = const()[name = string("op_2085_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2085_pad_0 = const()[name = string("op_2085_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2085_dilations_0 = const()[name = string("op_2085_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(917572224))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920193728))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2070_cast_fp16 = transpose(perm = var_2069, x = attn_output_9_cast_fp16)[name = string("transpose_276")];
+            tensor<fp16, [1, 2560, 1]> var_2085_cast_fp16 = conv(dilations = var_2085_dilations_0, groups = var_2085_groups_0, pad = var_2085_pad_0, pad_type = var_2085_pad_type_0, strides = var_2085_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_2070_cast_fp16)[name = string("op_2085_cast_fp16")];
+            tensor<int32, [3]> var_2089 = const()[name = string("op_2089"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2095 = const()[name = string("op_2095"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_31_cast_fp16 = transpose(perm = var_2089, x = var_2085_cast_fp16)[name = string("transpose_275")];
+            tensor<fp16, [1, 1, 2560]> var_2097_cast_fp16 = mul(x = x_31_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_2097_cast_fp16")];
+            bool input_45_interleave_0 = const()[name = string("input_45_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_45_cast_fp16 = concat(axis = var_2095, interleave = input_45_interleave_0, values = (x_31_cast_fp16, var_2097_cast_fp16))[name = string("input_45_cast_fp16")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2092_to_fp16 = const()[name = string("op_2092_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_2092_to_fp16, x = input_45_cast_fp16)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_2102_split_sizes_0 = const()[name = string("op_2102_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2102_axis_0 = const()[name = string("op_2102_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2102_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2102_cast_fp16_1 = split(axis = var_2102_axis_0, split_sizes = var_2102_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_2102_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920196352)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_11_cast_fp16 = mul(x = var_2102_cast_fp16_0, y = layers_c2_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_33_cast_fp16 = add(x = x_19_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_33_cast_fp16")];
+            int32 var_2111 = const()[name = string("op_2111"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2113_cast_fp16 = mul(x = x_33_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_2113_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_47_cast_fp16 = concat(axis = var_2111, interleave = input_47_interleave_0, values = (x_33_cast_fp16, var_2113_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2108_to_fp16 = const()[name = string("op_2108_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_2108_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_2118_split_sizes_0 = const()[name = string("op_2118_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2118_axis_0 = const()[name = string("op_2118_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2118_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2118_cast_fp16_1 = split(axis = var_2118_axis_0, split_sizes = var_2118_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_2118_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920201536)))];
+            tensor<fp16, [1, 1, 2560]> h_9_cast_fp16 = mul(x = var_2118_cast_fp16_0, y = layers_c2_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_2129 = const()[name = string("op_2129"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_49_axes_0 = const()[name = string("input_49_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2130 = transpose(perm = var_2129, x = h_9_cast_fp16)[name = string("transpose_274")];
+            tensor<fp16, [1, 2560, 1, 1]> input_49 = expand_dims(axes = input_49_axes_0, x = var_2130)[name = string("input_49")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_c2_1_mlp_gate_proj_weight_palettized, x = input_49)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_c2_1_mlp_up_proj_weight_palettized, x = input_49)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 1]> input_51 = mul(x = gate_7, y = up_3)[name = string("input_51")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_c2_1_mlp_down_proj_weight_palettized, x = input_51)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_2170_axes_0 = const()[name = string("op_2170_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2170 = squeeze(axes = var_2170_axes_0, x = mlp_out_3)[name = string("op_2170")];
+            tensor<int32, [3]> var_2174 = const()[name = string("op_2174"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2180 = const()[name = string("op_2180"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_35 = transpose(perm = var_2174, x = var_2170)[name = string("transpose_273")];
+            tensor<fp16, [1, 1, 2560]> var_2182 = mul(x = x_35, y = const_21_promoted)[name = string("op_2182")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_53 = concat(axis = var_2180, interleave = input_53_interleave_0, values = (x_35, var_2182))[name = string("input_53")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2177_to_fp16 = const()[name = string("op_2177_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_2177_to_fp16, x = input_53)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_2187_split_sizes_0 = const()[name = string("op_2187_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2187_axis_0 = const()[name = string("op_2187_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2187_0, tensor<fp16, [1, 1, 2560]> var_2187_1 = split(axis = var_2187_axis_0, split_sizes = var_2187_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_2187")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_13 = mul(x = var_2187_0, y = layers_c2_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_15_cast_fp16 = add(x = x_33_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 3328])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 1, 3584])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_2215 = const()[name = string("op_2215"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_55_axes_0 = const()[name = string("input_55_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2216 = transpose(perm = var_2215, x = hidden_states_15_cast_fp16)[name = string("transpose_272")];
+            tensor<fp16, [1, 2560, 1, 1]> input_55 = expand_dims(axes = input_55_axes_0, x = var_2216)[name = string("input_55")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_c2_1_per_layer_input_gate_weight_palettized, x = input_55)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_2235 = const()[name = string("op_2235"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2236_cast_fp16 = transpose(perm = var_2235, x = per_layer_slice_3_cast_fp16)[name = string("transpose_271")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_2236_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_57_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_57_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920206720))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920534464))))[name = string("layers_c2_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_c2_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_57_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_2252_axes_0 = const()[name = string("op_2252_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2252_cast_fp16 = squeeze(axes = var_2252_axes_0, x = gated_11_cast_fp16)[name = string("op_2252_cast_fp16")];
+            tensor<int32, [3]> var_2256 = const()[name = string("op_2256"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2262 = const()[name = string("op_2262"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_37_cast_fp16 = transpose(perm = var_2256, x = var_2252_cast_fp16)[name = string("transpose_270")];
+            tensor<fp16, [1, 1, 2560]> var_2264_cast_fp16 = mul(x = x_37_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_2264_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_59_cast_fp16 = concat(axis = var_2262, interleave = input_59_interleave_0, values = (x_37_cast_fp16, var_2264_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2259_to_fp16 = const()[name = string("op_2259_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_2259_to_fp16, x = input_59_cast_fp16)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_2269_split_sizes_0 = const()[name = string("op_2269_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2269_axis_0 = const()[name = string("op_2269_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2269_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2269_cast_fp16_1 = split(axis = var_2269_axis_0, split_sizes = var_2269_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_2269_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920537088)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_19_cast_fp16 = mul(x = var_2269_cast_fp16_0, y = layers_c2_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 1, 2560]> x_39_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_39_cast_fp16")];
+            tensor<int32, [1]> var_2281_axes_0 = const()[name = string("op_2281_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2281_cast_fp16 = squeeze(axes = var_2281_axes_0, x = K_sliding_out_3_cast_fp16)[name = string("op_2281_cast_fp16")];
+            tensor<int32, [1]> var_2283_axes_0 = const()[name = string("op_2283_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2283_cast_fp16 = squeeze(axes = var_2283_axes_0, x = V_sliding_out_3_cast_fp16)[name = string("op_2283_cast_fp16")];
+            tensor<int32, [4]> var_2286_begin_0 = const()[name = string("op_2286_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_2286_end_0 = const()[name = string("op_2286_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2286_end_mask_0 = const()[name = string("op_2286_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2286_squeeze_mask_0 = const()[name = string("op_2286_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2286_cast_fp16 = slice_by_index(begin = var_2286_begin_0, end = var_2286_end_0, end_mask = var_2286_end_mask_0, squeeze_mask = var_2286_squeeze_mask_0, x = K_sliding_in)[name = string("op_2286_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_5_axes_0 = const()[name = string("K_sliding_slot_5_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_5_cast_fp16 = expand_dims(axes = K_sliding_slot_5_axes_0, x = var_2286_cast_fp16)[name = string("K_sliding_slot_5_cast_fp16")];
+            tensor<int32, [4]> var_2291_begin_0 = const()[name = string("op_2291_begin_0"), val = tensor<int32, [4]>([2, 0, 0, 0])];
+            tensor<int32, [4]> var_2291_end_0 = const()[name = string("op_2291_end_0"), val = tensor<int32, [4]>([3, 2, 512, 512])];
+            tensor<bool, [4]> var_2291_end_mask_0 = const()[name = string("op_2291_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2291_squeeze_mask_0 = const()[name = string("op_2291_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2291_cast_fp16 = slice_by_index(begin = var_2291_begin_0, end = var_2291_end_0, end_mask = var_2291_end_mask_0, squeeze_mask = var_2291_squeeze_mask_0, x = V_sliding_in)[name = string("op_2291_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_5_axes_0 = const()[name = string("V_sliding_slot_5_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_5_cast_fp16 = expand_dims(axes = V_sliding_slot_5_axes_0, x = var_2291_cast_fp16)[name = string("V_sliding_slot_5_cast_fp16")];
+            int32 var_2298 = const()[name = string("op_2298"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2300_cast_fp16 = mul(x = x_39_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_2300_cast_fp16")];
+            bool input_61_interleave_0 = const()[name = string("input_61_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_61_cast_fp16 = concat(axis = var_2298, interleave = input_61_interleave_0, values = (x_39_cast_fp16, var_2300_cast_fp16))[name = string("input_61_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2295_to_fp16 = const()[name = string("op_2295_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_2295_to_fp16, x = input_61_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_2305_split_sizes_0 = const()[name = string("op_2305_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2305_axis_0 = const()[name = string("op_2305_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2305_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2305_cast_fp16_1 = split(axis = var_2305_axis_0, split_sizes = var_2305_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_2305_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920542272)))];
+            tensor<fp16, [1, 1, 2560]> h_13_cast_fp16 = mul(x = var_2305_cast_fp16_0, y = layers_c2_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_2311 = const()[name = string("op_2311"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2314_axes_0 = const()[name = string("op_2314_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2312_cast_fp16 = transpose(perm = var_2311, x = h_13_cast_fp16)[name = string("transpose_269")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2314_cast_fp16 = expand_dims(axes = var_2314_axes_0, x = var_2312_cast_fp16)[name = string("op_2314_cast_fp16")];
+            string var_2330_pad_type_0 = const()[name = string("op_2330_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2330_strides_0 = const()[name = string("op_2330_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2330_pad_0 = const()[name = string("op_2330_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2330_dilations_0 = const()[name = string("op_2330_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2330_groups_0 = const()[name = string("op_2330_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2330 = conv(dilations = var_2330_dilations_0, groups = var_2330_groups_0, pad = var_2330_pad_0, pad_type = var_2330_pad_type_0, strides = var_2330_strides_0, weight = layers_c2_2_self_attn_q_proj_weight_palettized, x = var_2314_cast_fp16)[name = string("op_2330")];
+            tensor<int32, [4]> var_2335 = const()[name = string("op_2335"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2336 = reshape(shape = var_2335, x = var_2330)[name = string("op_2336")];
+            tensor<int32, [4]> var_2341 = const()[name = string("op_2341"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2351 = const()[name = string("op_2351"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2342 = transpose(perm = var_2341, x = var_2336)[name = string("transpose_268")];
+            tensor<fp16, [1, 8, 256]> x_41 = reshape(shape = var_2351, x = var_2342)[name = string("x_41")];
+            int32 var_2357 = const()[name = string("op_2357"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2359 = mul(x = x_41, y = const_25_promoted)[name = string("op_2359")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_65 = concat(axis = var_2357, interleave = input_65_interleave_0, values = (x_41, var_2359))[name = string("input_65")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2354_to_fp16 = const()[name = string("op_2354_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_2354_to_fp16, x = input_65)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_2364_split_sizes_0 = const()[name = string("op_2364_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2364_axis_0 = const()[name = string("op_2364_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2364_0, tensor<fp16, [1, 8, 256]> var_2364_1 = split(axis = var_2364_axis_0, split_sizes = var_2364_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_2364")];
+            tensor<fp16, [1, 8, 256]> var_2366 = mul(x = var_2364_0, y = layers_c2_2_self_attn_q_norm_weight)[name = string("op_2366")];
+            tensor<int32, [4]> var_2371 = const()[name = string("op_2371"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_19 = reshape(shape = var_2371, x = var_2366)[name = string("q_19")];
+            tensor<fp16, [1, 8, 1, 256]> var_2373_cast_fp16 = mul(x = q_19, y = cos_s)[name = string("op_2373_cast_fp16")];
+            tensor<int32, [2]> var_2374_split_sizes_0 = const()[name = string("op_2374_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2374_axis_0 = const()[name = string("op_2374_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2374_0, tensor<fp16, [1, 8, 1, 128]> var_2374_1 = split(axis = var_2374_axis_0, split_sizes = var_2374_split_sizes_0, x = q_19)[name = string("op_2374")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2376 = mul(x = var_2374_1, y = const_26_promoted)[name = string("op_2376")];
+            int32 var_2378 = const()[name = string("op_2378"), val = int32(-1)];
+            bool var_2379_interleave_0 = const()[name = string("op_2379_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2379 = concat(axis = var_2378, interleave = var_2379_interleave_0, values = (var_2376, var_2374_0))[name = string("op_2379")];
+            tensor<fp16, [1, 8, 1, 256]> var_2380_cast_fp16 = mul(x = var_2379, y = sin_s)[name = string("op_2380_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_23_cast_fp16 = add(x = var_2373_cast_fp16, y = var_2380_cast_fp16)[name = string("q_23_cast_fp16")];
+            string var_2393_pad_type_0 = const()[name = string("op_2393_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2393_strides_0 = const()[name = string("op_2393_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2393_pad_0 = const()[name = string("op_2393_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2393_dilations_0 = const()[name = string("op_2393_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2393_groups_0 = const()[name = string("op_2393_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2393 = conv(dilations = var_2393_dilations_0, groups = var_2393_groups_0, pad = var_2393_pad_0, pad_type = var_2393_pad_type_0, strides = var_2393_strides_0, weight = layers_c2_2_self_attn_k_proj_weight_palettized, x = var_2314_cast_fp16)[name = string("op_2393")];
+            tensor<int32, [4]> var_2398 = const()[name = string("op_2398"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2399 = reshape(shape = var_2398, x = var_2393)[name = string("op_2399")];
+            tensor<int32, [4]> var_2404 = const()[name = string("op_2404"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_2421_pad_type_0 = const()[name = string("op_2421_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2421_strides_0 = const()[name = string("op_2421_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2421_pad_0 = const()[name = string("op_2421_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2421_dilations_0 = const()[name = string("op_2421_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2421_groups_0 = const()[name = string("op_2421_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2421 = conv(dilations = var_2421_dilations_0, groups = var_2421_groups_0, pad = var_2421_pad_0, pad_type = var_2421_pad_type_0, strides = var_2421_strides_0, weight = layers_c2_2_self_attn_v_proj_weight_palettized, x = var_2314_cast_fp16)[name = string("op_2421")];
+            tensor<int32, [4]> var_2426 = const()[name = string("op_2426"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2427 = reshape(shape = var_2426, x = var_2421)[name = string("op_2427")];
+            tensor<int32, [4]> var_2432 = const()[name = string("op_2432"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2442 = const()[name = string("op_2442"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_2405 = transpose(perm = var_2404, x = var_2399)[name = string("transpose_267")];
+            tensor<fp16, [1, 2, 256]> x_43 = reshape(shape = var_2442, x = var_2405)[name = string("x_43")];
+            int32 var_2448 = const()[name = string("op_2448"), val = int32(-1)];
+            fp16 const_27_promoted = const()[name = string("const_27_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_2450 = mul(x = x_43, y = const_27_promoted)[name = string("op_2450")];
+            bool input_67_interleave_0 = const()[name = string("input_67_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_67 = concat(axis = var_2448, interleave = input_67_interleave_0, values = (x_43, var_2450))[name = string("input_67")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2445_to_fp16 = const()[name = string("op_2445_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_2445_to_fp16, x = input_67)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_2455_split_sizes_0 = const()[name = string("op_2455_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2455_axis_0 = const()[name = string("op_2455_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_2455_0, tensor<fp16, [1, 2, 256]> var_2455_1 = split(axis = var_2455_axis_0, split_sizes = var_2455_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_2455")];
+            tensor<fp16, [1, 2, 256]> var_2457 = mul(x = var_2455_0, y = layers_c2_2_self_attn_k_norm_weight)[name = string("op_2457")];
+            tensor<int32, [4]> var_2462 = const()[name = string("op_2462"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_21 = reshape(shape = var_2462, x = var_2457)[name = string("q_21")];
+            fp16 var_2464_promoted = const()[name = string("op_2464_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_2433 = transpose(perm = var_2432, x = var_2427)[name = string("transpose_266")];
+            tensor<fp16, [1, 2, 1, 256]> var_2465 = pow(x = var_2433, y = var_2464_promoted)[name = string("op_2465")];
+            tensor<int32, [1]> var_2470_axes_0 = const()[name = string("op_2470_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2470_keep_dims_0 = const()[name = string("op_2470_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_2470 = reduce_mean(axes = var_2470_axes_0, keep_dims = var_2470_keep_dims_0, x = var_2465)[name = string("op_2470")];
+            fp16 var_2472_to_fp16 = const()[name = string("op_2472_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_5_cast_fp16 = add(x = var_2470, y = var_2472_to_fp16)[name = string("mean_sq_5_cast_fp16")];
+            fp32 var_2474_epsilon_0 = const()[name = string("op_2474_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_2474_cast_fp16 = rsqrt(epsilon = var_2474_epsilon_0, x = mean_sq_5_cast_fp16)[name = string("op_2474_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_71_cast_fp16 = mul(x = var_2433, y = var_2474_cast_fp16)[name = string("input_71_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_2476_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_2476_cast_fp16")];
+            tensor<int32, [2]> var_2477_split_sizes_0 = const()[name = string("op_2477_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2477_axis_0 = const()[name = string("op_2477_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_2477_0, tensor<fp16, [1, 2, 1, 128]> var_2477_1 = split(axis = var_2477_axis_0, split_sizes = var_2477_split_sizes_0, x = q_21)[name = string("op_2477")];
+            fp16 const_28_promoted = const()[name = string("const_28_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_2479 = mul(x = var_2477_1, y = const_28_promoted)[name = string("op_2479")];
+            int32 var_2481 = const()[name = string("op_2481"), val = int32(-1)];
+            bool var_2482_interleave_0 = const()[name = string("op_2482_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_2482 = concat(axis = var_2481, interleave = var_2482_interleave_0, values = (var_2479, var_2477_0))[name = string("op_2482")];
+            tensor<fp16, [1, 2, 1, 256]> var_2483_cast_fp16 = mul(x = var_2482, y = sin_s)[name = string("op_2483_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_69_cast_fp16 = add(x = var_2476_cast_fp16, y = var_2483_cast_fp16)[name = string("input_69_cast_fp16")];
+            tensor<int32, [8]> k_padded_5_pad_0 = const()[name = string("k_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_5_mode_0 = const()[name = string("k_padded_5_mode_0"), val = string("constant")];
+            fp16 const_29_to_fp16 = const()[name = string("const_29_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_5_cast_fp16 = pad(constant_val = const_29_to_fp16, mode = k_padded_5_mode_0, pad = k_padded_5_pad_0, x = input_69_cast_fp16)[name = string("k_padded_5_cast_fp16")];
+            tensor<int32, [8]> v_padded_5_pad_0 = const()[name = string("v_padded_5_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_5_mode_0 = const()[name = string("v_padded_5_mode_0"), val = string("constant")];
+            fp16 const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_5_cast_fp16 = pad(constant_val = const_30_to_fp16, mode = v_padded_5_mode_0, pad = v_padded_5_pad_0, x = input_71_cast_fp16)[name = string("v_padded_5_cast_fp16")];
+            tensor<int32, [4]> var_2512_begin_0 = const()[name = string("op_2512_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2512_end_0 = const()[name = string("op_2512_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2512_end_mask_0 = const()[name = string("op_2512_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2512_cast_fp16 = slice_by_index(begin = var_2512_begin_0, end = var_2512_end_0, end_mask = var_2512_end_mask_0, x = K_sliding_slot_5_cast_fp16)[name = string("op_2512_cast_fp16")];
+            int32 var_2519 = const()[name = string("op_2519"), val = int32(2)];
+            bool K_sliding_out_5_interleave_0 = const()[name = string("K_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_5_cast_fp16 = concat(axis = var_2519, interleave = K_sliding_out_5_interleave_0, values = (var_2512_cast_fp16, k_padded_5_cast_fp16))[name = string("K_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> var_2535_begin_0 = const()[name = string("op_2535_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_2535_end_0 = const()[name = string("op_2535_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_2535_end_mask_0 = const()[name = string("op_2535_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_2535_cast_fp16 = slice_by_index(begin = var_2535_begin_0, end = var_2535_end_0, end_mask = var_2535_end_mask_0, x = V_sliding_slot_5_cast_fp16)[name = string("op_2535_cast_fp16")];
+            int32 var_2542 = const()[name = string("op_2542"), val = int32(2)];
+            bool V_sliding_out_5_interleave_0 = const()[name = string("V_sliding_out_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_5_cast_fp16 = concat(axis = var_2542, interleave = V_sliding_out_5_interleave_0, values = (var_2535_cast_fp16, v_padded_5_cast_fp16))[name = string("V_sliding_out_5_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_5_begin_0 = const()[name = string("K_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_5_end_0 = const()[name = string("K_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_5_end_mask_0 = const()[name = string("K_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_5_cast_fp16 = slice_by_index(begin = K_for_attn_5_begin_0, end = K_for_attn_5_end_0, end_mask = K_for_attn_5_end_mask_0, x = K_sliding_out_5_cast_fp16)[name = string("K_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_5_begin_0 = const()[name = string("V_for_attn_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_5_end_0 = const()[name = string("V_for_attn_5_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_5_end_mask_0 = const()[name = string("V_for_attn_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_5_cast_fp16 = slice_by_index(begin = V_for_attn_5_begin_0, end = V_for_attn_5_end_0, end_mask = V_for_attn_5_end_mask_0, x = V_sliding_out_5_cast_fp16)[name = string("V_for_attn_5_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = K_for_attn_5_cast_fp16)[name = string("transpose_265")];
+            tensor<fp16, [8, 1, 512, 256]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_264")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_86_perm_0 = const()[name = string("transpose_86_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = V_for_attn_5_cast_fp16)[name = string("transpose_263")];
+            tensor<fp16, [8, 1, 512, 256]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_262")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_86_cast_fp16 = transpose(perm = transpose_86_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_261")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_23_cast_fp16, y = transpose_86_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_47_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_47_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_47_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 1, 512]> var_2583 = sub(x = x_47_cast_fp16, y = reduce_max_2)[name = string("op_2583")];
+            tensor<fp16, [1, 8, 1, 512]> var_2589 = exp(x = var_2583)[name = string("op_2589")];
+            tensor<int32, [1]> var_2599_axes_0 = const()[name = string("op_2599_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2599_keep_dims_0 = const()[name = string("op_2599_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2599 = reduce_sum(axes = var_2599_axes_0, keep_dims = var_2599_keep_dims_0, x = var_2589)[name = string("op_2599")];
+            tensor<fp16, [1, 8, 1, 512]> var_2605_cast_fp16 = real_div(x = var_2589, y = var_2599)[name = string("op_2605_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_260")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_2605_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_2616 = const()[name = string("op_2616"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2623 = const()[name = string("op_2623"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2617_cast_fp16 = transpose(perm = var_2616, x = attn_output_13_cast_fp16)[name = string("transpose_259")];
+            tensor<fp16, [1, 1, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_2623, x = var_2617_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_2628 = const()[name = string("op_2628"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2644_pad_type_0 = const()[name = string("op_2644_pad_type_0"), val = string("valid")];
+            int32 var_2644_groups_0 = const()[name = string("op_2644_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2644_strides_0 = const()[name = string("op_2644_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2644_pad_0 = const()[name = string("op_2644_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2644_dilations_0 = const()[name = string("op_2644_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(920547456))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923168960))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2629_cast_fp16 = transpose(perm = var_2628, x = attn_output_15_cast_fp16)[name = string("transpose_258")];
+            tensor<fp16, [1, 2560, 1]> var_2644_cast_fp16 = conv(dilations = var_2644_dilations_0, groups = var_2644_groups_0, pad = var_2644_pad_0, pad_type = var_2644_pad_type_0, strides = var_2644_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_2629_cast_fp16)[name = string("op_2644_cast_fp16")];
+            tensor<int32, [3]> var_2648 = const()[name = string("op_2648"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2654 = const()[name = string("op_2654"), val = int32(-1)];
+            fp16 const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_51_cast_fp16 = transpose(perm = var_2648, x = var_2644_cast_fp16)[name = string("transpose_257")];
+            tensor<fp16, [1, 1, 2560]> var_2656_cast_fp16 = mul(x = x_51_cast_fp16, y = const_31_promoted_to_fp16)[name = string("op_2656_cast_fp16")];
+            bool input_75_interleave_0 = const()[name = string("input_75_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_75_cast_fp16 = concat(axis = var_2654, interleave = input_75_interleave_0, values = (x_51_cast_fp16, var_2656_cast_fp16))[name = string("input_75_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2651_to_fp16 = const()[name = string("op_2651_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_2651_to_fp16, x = input_75_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_2661_split_sizes_0 = const()[name = string("op_2661_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2661_axis_0 = const()[name = string("op_2661_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2661_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2661_cast_fp16_1 = split(axis = var_2661_axis_0, split_sizes = var_2661_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_2661_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923171584)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_17_cast_fp16 = mul(x = var_2661_cast_fp16_0, y = layers_c2_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_53_cast_fp16 = add(x = x_39_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_53_cast_fp16")];
+            int32 var_2670 = const()[name = string("op_2670"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2672_cast_fp16 = mul(x = x_53_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2672_cast_fp16")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_77_cast_fp16 = concat(axis = var_2670, interleave = input_77_interleave_0, values = (x_53_cast_fp16, var_2672_cast_fp16))[name = string("input_77_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2667_to_fp16 = const()[name = string("op_2667_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_2667_to_fp16, x = input_77_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_2677_split_sizes_0 = const()[name = string("op_2677_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2677_axis_0 = const()[name = string("op_2677_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2677_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2677_cast_fp16_1 = split(axis = var_2677_axis_0, split_sizes = var_2677_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_2677_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923176768)))];
+            tensor<fp16, [1, 1, 2560]> h_15_cast_fp16 = mul(x = var_2677_cast_fp16_0, y = layers_c2_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_2688 = const()[name = string("op_2688"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_79_axes_0 = const()[name = string("input_79_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2689 = transpose(perm = var_2688, x = h_15_cast_fp16)[name = string("transpose_256")];
+            tensor<fp16, [1, 2560, 1, 1]> input_79 = expand_dims(axes = input_79_axes_0, x = var_2689)[name = string("input_79")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_c2_2_mlp_gate_proj_weight_palettized, x = input_79)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_c2_2_mlp_up_proj_weight_palettized, x = input_79)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 1]> input_81 = mul(x = gate_11, y = up_5)[name = string("input_81")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_c2_2_mlp_down_proj_weight_palettized, x = input_81)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_2729_axes_0 = const()[name = string("op_2729_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2729 = squeeze(axes = var_2729_axes_0, x = mlp_out_5)[name = string("op_2729")];
+            tensor<int32, [3]> var_2733 = const()[name = string("op_2733"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2739 = const()[name = string("op_2739"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_55 = transpose(perm = var_2733, x = var_2729)[name = string("transpose_255")];
+            tensor<fp16, [1, 1, 2560]> var_2741 = mul(x = x_55, y = const_33_promoted)[name = string("op_2741")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_83 = concat(axis = var_2739, interleave = input_83_interleave_0, values = (x_55, var_2741))[name = string("input_83")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2736_to_fp16 = const()[name = string("op_2736_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_2736_to_fp16, x = input_83)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_2746_split_sizes_0 = const()[name = string("op_2746_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2746_axis_0 = const()[name = string("op_2746_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2746_0, tensor<fp16, [1, 1, 2560]> var_2746_1 = split(axis = var_2746_axis_0, split_sizes = var_2746_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_2746")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_23 = mul(x = var_2746_0, y = layers_c2_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_25_cast_fp16 = add(x = x_53_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 3584])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 1, 3840])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_2774 = const()[name = string("op_2774"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2775 = transpose(perm = var_2774, x = hidden_states_25_cast_fp16)[name = string("transpose_254")];
+            tensor<fp16, [1, 2560, 1, 1]> input_85 = expand_dims(axes = input_85_axes_0, x = var_2775)[name = string("input_85")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_c2_2_per_layer_input_gate_weight_palettized, x = input_85)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_2794 = const()[name = string("op_2794"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2795_cast_fp16 = transpose(perm = var_2794, x = per_layer_slice_5_cast_fp16)[name = string("transpose_253")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_2795_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_87_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_87_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923181952))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923509696))))[name = string("layers_c2_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_c2_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_87_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_2811_axes_0 = const()[name = string("op_2811_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2811_cast_fp16 = squeeze(axes = var_2811_axes_0, x = gated_17_cast_fp16)[name = string("op_2811_cast_fp16")];
+            tensor<int32, [3]> var_2815 = const()[name = string("op_2815"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2821 = const()[name = string("op_2821"), val = int32(-1)];
+            fp16 const_34_promoted_to_fp16 = const()[name = string("const_34_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_57_cast_fp16 = transpose(perm = var_2815, x = var_2811_cast_fp16)[name = string("transpose_252")];
+            tensor<fp16, [1, 1, 2560]> var_2823_cast_fp16 = mul(x = x_57_cast_fp16, y = const_34_promoted_to_fp16)[name = string("op_2823_cast_fp16")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_89_cast_fp16 = concat(axis = var_2821, interleave = input_89_interleave_0, values = (x_57_cast_fp16, var_2823_cast_fp16))[name = string("input_89_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2818_to_fp16 = const()[name = string("op_2818_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_2818_to_fp16, x = input_89_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_2828_split_sizes_0 = const()[name = string("op_2828_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2828_axis_0 = const()[name = string("op_2828_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2828_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2828_cast_fp16_1 = split(axis = var_2828_axis_0, split_sizes = var_2828_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_2828_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923512320)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_29_cast_fp16 = mul(x = var_2828_cast_fp16_0, y = layers_c2_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.58p-1])];
+            tensor<fp16, [1, 1, 2560]> x_59_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_35_promoted_to_fp16)[name = string("x_59_cast_fp16")];
+            tensor<int32, [1]> var_2840_axes_0 = const()[name = string("op_2840_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2840_cast_fp16 = squeeze(axes = var_2840_axes_0, x = K_sliding_out_5_cast_fp16)[name = string("op_2840_cast_fp16")];
+            tensor<int32, [1]> var_2842_axes_0 = const()[name = string("op_2842_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_2842_cast_fp16 = squeeze(axes = var_2842_axes_0, x = V_sliding_out_5_cast_fp16)[name = string("op_2842_cast_fp16")];
+            tensor<int32, [4]> var_2845_begin_0 = const()[name = string("op_2845_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2845_end_0 = const()[name = string("op_2845_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2845_end_mask_0 = const()[name = string("op_2845_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2845_squeeze_mask_0 = const()[name = string("op_2845_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2845_cast_fp16 = slice_by_index(begin = var_2845_begin_0, end = var_2845_end_0, end_mask = var_2845_end_mask_0, squeeze_mask = var_2845_squeeze_mask_0, x = K_sliding_in)[name = string("op_2845_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_7_axes_0 = const()[name = string("K_sliding_slot_7_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_7_cast_fp16 = expand_dims(axes = K_sliding_slot_7_axes_0, x = var_2845_cast_fp16)[name = string("K_sliding_slot_7_cast_fp16")];
+            tensor<int32, [4]> var_2850_begin_0 = const()[name = string("op_2850_begin_0"), val = tensor<int32, [4]>([3, 0, 0, 0])];
+            tensor<int32, [4]> var_2850_end_0 = const()[name = string("op_2850_end_0"), val = tensor<int32, [4]>([4, 2, 512, 512])];
+            tensor<bool, [4]> var_2850_end_mask_0 = const()[name = string("op_2850_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_2850_squeeze_mask_0 = const()[name = string("op_2850_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_2850_cast_fp16 = slice_by_index(begin = var_2850_begin_0, end = var_2850_end_0, end_mask = var_2850_end_mask_0, squeeze_mask = var_2850_squeeze_mask_0, x = V_sliding_in)[name = string("op_2850_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_7_axes_0 = const()[name = string("V_sliding_slot_7_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_7_cast_fp16 = expand_dims(axes = V_sliding_slot_7_axes_0, x = var_2850_cast_fp16)[name = string("V_sliding_slot_7_cast_fp16")];
+            int32 var_2857 = const()[name = string("op_2857"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2859_cast_fp16 = mul(x = x_59_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2859_cast_fp16")];
+            bool input_91_interleave_0 = const()[name = string("input_91_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_91_cast_fp16 = concat(axis = var_2857, interleave = input_91_interleave_0, values = (x_59_cast_fp16, var_2859_cast_fp16))[name = string("input_91_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2854_to_fp16 = const()[name = string("op_2854_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_2854_to_fp16, x = input_91_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_2864_split_sizes_0 = const()[name = string("op_2864_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2864_axis_0 = const()[name = string("op_2864_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2864_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2864_cast_fp16_1 = split(axis = var_2864_axis_0, split_sizes = var_2864_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_2864_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923517504)))];
+            tensor<fp16, [1, 1, 2560]> h_19_cast_fp16 = mul(x = var_2864_cast_fp16_0, y = layers_c2_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_2870 = const()[name = string("op_2870"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2873_axes_0 = const()[name = string("op_2873_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2871_cast_fp16 = transpose(perm = var_2870, x = h_19_cast_fp16)[name = string("transpose_251")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2873_cast_fp16 = expand_dims(axes = var_2873_axes_0, x = var_2871_cast_fp16)[name = string("op_2873_cast_fp16")];
+            string var_2889_pad_type_0 = const()[name = string("op_2889_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2889_strides_0 = const()[name = string("op_2889_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2889_pad_0 = const()[name = string("op_2889_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2889_dilations_0 = const()[name = string("op_2889_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2889_groups_0 = const()[name = string("op_2889_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2889 = conv(dilations = var_2889_dilations_0, groups = var_2889_groups_0, pad = var_2889_pad_0, pad_type = var_2889_pad_type_0, strides = var_2889_strides_0, weight = layers_c2_3_self_attn_q_proj_weight_palettized, x = var_2873_cast_fp16)[name = string("op_2889")];
+            tensor<int32, [4]> var_2894 = const()[name = string("op_2894"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2895 = reshape(shape = var_2894, x = var_2889)[name = string("op_2895")];
+            tensor<int32, [4]> var_2900 = const()[name = string("op_2900"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2910 = const()[name = string("op_2910"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2901 = transpose(perm = var_2900, x = var_2895)[name = string("transpose_250")];
+            tensor<fp16, [1, 8, 256]> x_61 = reshape(shape = var_2910, x = var_2901)[name = string("x_61")];
+            int32 var_2916 = const()[name = string("op_2916"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2918 = mul(x = x_61, y = const_37_promoted)[name = string("op_2918")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_95 = concat(axis = var_2916, interleave = input_95_interleave_0, values = (x_61, var_2918))[name = string("input_95")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2913_to_fp16 = const()[name = string("op_2913_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_2913_to_fp16, x = input_95)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_2923_split_sizes_0 = const()[name = string("op_2923_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2923_axis_0 = const()[name = string("op_2923_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2923_0, tensor<fp16, [1, 8, 256]> var_2923_1 = split(axis = var_2923_axis_0, split_sizes = var_2923_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_2923")];
+            tensor<fp16, [1, 8, 256]> var_2925 = mul(x = var_2923_0, y = layers_c2_3_self_attn_q_norm_weight)[name = string("op_2925")];
+            tensor<int32, [4]> var_2930 = const()[name = string("op_2930"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_27 = reshape(shape = var_2930, x = var_2925)[name = string("q_27")];
+            tensor<fp16, [1, 8, 1, 256]> var_2932_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_2932_cast_fp16")];
+            tensor<int32, [2]> var_2933_split_sizes_0 = const()[name = string("op_2933_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2933_axis_0 = const()[name = string("op_2933_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2933_0, tensor<fp16, [1, 8, 1, 128]> var_2933_1 = split(axis = var_2933_axis_0, split_sizes = var_2933_split_sizes_0, x = q_27)[name = string("op_2933")];
+            fp16 const_38_promoted = const()[name = string("const_38_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2935 = mul(x = var_2933_1, y = const_38_promoted)[name = string("op_2935")];
+            int32 var_2937 = const()[name = string("op_2937"), val = int32(-1)];
+            bool var_2938_interleave_0 = const()[name = string("op_2938_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2938 = concat(axis = var_2937, interleave = var_2938_interleave_0, values = (var_2935, var_2933_0))[name = string("op_2938")];
+            tensor<fp16, [1, 8, 1, 256]> var_2939_cast_fp16 = mul(x = var_2938, y = sin_s)[name = string("op_2939_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_31_cast_fp16 = add(x = var_2932_cast_fp16, y = var_2939_cast_fp16)[name = string("q_31_cast_fp16")];
+            string var_2952_pad_type_0 = const()[name = string("op_2952_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2952_strides_0 = const()[name = string("op_2952_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2952_pad_0 = const()[name = string("op_2952_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2952_dilations_0 = const()[name = string("op_2952_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2952_groups_0 = const()[name = string("op_2952_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2952 = conv(dilations = var_2952_dilations_0, groups = var_2952_groups_0, pad = var_2952_pad_0, pad_type = var_2952_pad_type_0, strides = var_2952_strides_0, weight = layers_c2_3_self_attn_k_proj_weight_palettized, x = var_2873_cast_fp16)[name = string("op_2952")];
+            tensor<int32, [4]> var_2957 = const()[name = string("op_2957"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2958 = reshape(shape = var_2957, x = var_2952)[name = string("op_2958")];
+            tensor<int32, [4]> var_2963 = const()[name = string("op_2963"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_2980_pad_type_0 = const()[name = string("op_2980_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2980_strides_0 = const()[name = string("op_2980_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2980_pad_0 = const()[name = string("op_2980_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2980_dilations_0 = const()[name = string("op_2980_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2980_groups_0 = const()[name = string("op_2980_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_2980 = conv(dilations = var_2980_dilations_0, groups = var_2980_groups_0, pad = var_2980_pad_0, pad_type = var_2980_pad_type_0, strides = var_2980_strides_0, weight = layers_c2_3_self_attn_v_proj_weight_palettized, x = var_2873_cast_fp16)[name = string("op_2980")];
+            tensor<int32, [4]> var_2985 = const()[name = string("op_2985"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_2986 = reshape(shape = var_2985, x = var_2980)[name = string("op_2986")];
+            tensor<int32, [4]> var_2991 = const()[name = string("op_2991"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3001 = const()[name = string("op_3001"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_2964 = transpose(perm = var_2963, x = var_2958)[name = string("transpose_249")];
+            tensor<fp16, [1, 2, 256]> x_63 = reshape(shape = var_3001, x = var_2964)[name = string("x_63")];
+            int32 var_3007 = const()[name = string("op_3007"), val = int32(-1)];
+            fp16 const_39_promoted = const()[name = string("const_39_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_3009 = mul(x = x_63, y = const_39_promoted)[name = string("op_3009")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_97 = concat(axis = var_3007, interleave = input_97_interleave_0, values = (x_63, var_3009))[name = string("input_97")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3004_to_fp16 = const()[name = string("op_3004_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_3004_to_fp16, x = input_97)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_3014_split_sizes_0 = const()[name = string("op_3014_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3014_axis_0 = const()[name = string("op_3014_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_3014_0, tensor<fp16, [1, 2, 256]> var_3014_1 = split(axis = var_3014_axis_0, split_sizes = var_3014_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_3014")];
+            tensor<fp16, [1, 2, 256]> var_3016 = mul(x = var_3014_0, y = layers_c2_3_self_attn_k_norm_weight)[name = string("op_3016")];
+            tensor<int32, [4]> var_3021 = const()[name = string("op_3021"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_29 = reshape(shape = var_3021, x = var_3016)[name = string("q_29")];
+            fp16 var_3023_promoted = const()[name = string("op_3023_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_2992 = transpose(perm = var_2991, x = var_2986)[name = string("transpose_248")];
+            tensor<fp16, [1, 2, 1, 256]> var_3024 = pow(x = var_2992, y = var_3023_promoted)[name = string("op_3024")];
+            tensor<int32, [1]> var_3029_axes_0 = const()[name = string("op_3029_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3029_keep_dims_0 = const()[name = string("op_3029_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_3029 = reduce_mean(axes = var_3029_axes_0, keep_dims = var_3029_keep_dims_0, x = var_3024)[name = string("op_3029")];
+            fp16 var_3031_to_fp16 = const()[name = string("op_3031_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_7_cast_fp16 = add(x = var_3029, y = var_3031_to_fp16)[name = string("mean_sq_7_cast_fp16")];
+            fp32 var_3033_epsilon_0 = const()[name = string("op_3033_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_3033_cast_fp16 = rsqrt(epsilon = var_3033_epsilon_0, x = mean_sq_7_cast_fp16)[name = string("op_3033_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_101_cast_fp16 = mul(x = var_2992, y = var_3033_cast_fp16)[name = string("input_101_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_3035_cast_fp16 = mul(x = q_29, y = cos_s)[name = string("op_3035_cast_fp16")];
+            tensor<int32, [2]> var_3036_split_sizes_0 = const()[name = string("op_3036_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3036_axis_0 = const()[name = string("op_3036_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_3036_0, tensor<fp16, [1, 2, 1, 128]> var_3036_1 = split(axis = var_3036_axis_0, split_sizes = var_3036_split_sizes_0, x = q_29)[name = string("op_3036")];
+            fp16 const_40_promoted = const()[name = string("const_40_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_3038 = mul(x = var_3036_1, y = const_40_promoted)[name = string("op_3038")];
+            int32 var_3040 = const()[name = string("op_3040"), val = int32(-1)];
+            bool var_3041_interleave_0 = const()[name = string("op_3041_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_3041 = concat(axis = var_3040, interleave = var_3041_interleave_0, values = (var_3038, var_3036_0))[name = string("op_3041")];
+            tensor<fp16, [1, 2, 1, 256]> var_3042_cast_fp16 = mul(x = var_3041, y = sin_s)[name = string("op_3042_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_99_cast_fp16 = add(x = var_3035_cast_fp16, y = var_3042_cast_fp16)[name = string("input_99_cast_fp16")];
+            tensor<int32, [8]> k_padded_7_pad_0 = const()[name = string("k_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_7_mode_0 = const()[name = string("k_padded_7_mode_0"), val = string("constant")];
+            fp16 const_41_to_fp16 = const()[name = string("const_41_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_7_cast_fp16 = pad(constant_val = const_41_to_fp16, mode = k_padded_7_mode_0, pad = k_padded_7_pad_0, x = input_99_cast_fp16)[name = string("k_padded_7_cast_fp16")];
+            tensor<int32, [8]> v_padded_7_pad_0 = const()[name = string("v_padded_7_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_7_mode_0 = const()[name = string("v_padded_7_mode_0"), val = string("constant")];
+            fp16 const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_7_cast_fp16 = pad(constant_val = const_42_to_fp16, mode = v_padded_7_mode_0, pad = v_padded_7_pad_0, x = input_101_cast_fp16)[name = string("v_padded_7_cast_fp16")];
+            tensor<int32, [4]> var_3071_begin_0 = const()[name = string("op_3071_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3071_end_0 = const()[name = string("op_3071_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3071_end_mask_0 = const()[name = string("op_3071_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3071_cast_fp16 = slice_by_index(begin = var_3071_begin_0, end = var_3071_end_0, end_mask = var_3071_end_mask_0, x = K_sliding_slot_7_cast_fp16)[name = string("op_3071_cast_fp16")];
+            int32 var_3078 = const()[name = string("op_3078"), val = int32(2)];
+            bool K_sliding_out_7_interleave_0 = const()[name = string("K_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_7_cast_fp16 = concat(axis = var_3078, interleave = K_sliding_out_7_interleave_0, values = (var_3071_cast_fp16, k_padded_7_cast_fp16))[name = string("K_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> var_3094_begin_0 = const()[name = string("op_3094_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3094_end_0 = const()[name = string("op_3094_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3094_end_mask_0 = const()[name = string("op_3094_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3094_cast_fp16 = slice_by_index(begin = var_3094_begin_0, end = var_3094_end_0, end_mask = var_3094_end_mask_0, x = V_sliding_slot_7_cast_fp16)[name = string("op_3094_cast_fp16")];
+            int32 var_3101 = const()[name = string("op_3101"), val = int32(2)];
+            bool V_sliding_out_7_interleave_0 = const()[name = string("V_sliding_out_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_7_cast_fp16 = concat(axis = var_3101, interleave = V_sliding_out_7_interleave_0, values = (var_3094_cast_fp16, v_padded_7_cast_fp16))[name = string("V_sliding_out_7_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_7_begin_0 = const()[name = string("K_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_7_end_0 = const()[name = string("K_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_7_end_mask_0 = const()[name = string("K_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_7_cast_fp16 = slice_by_index(begin = K_for_attn_7_begin_0, end = K_for_attn_7_end_0, end_mask = K_for_attn_7_end_mask_0, x = K_sliding_out_7_cast_fp16)[name = string("K_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_7_begin_0 = const()[name = string("V_for_attn_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_7_end_0 = const()[name = string("V_for_attn_7_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_7_end_mask_0 = const()[name = string("V_for_attn_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_7_cast_fp16 = slice_by_index(begin = V_for_attn_7_begin_0, end = V_for_attn_7_end_0, end_mask = V_for_attn_7_end_mask_0, x = V_sliding_out_7_cast_fp16)[name = string("V_for_attn_7_cast_fp16")];
+            tensor<int32, [4]> transpose_12_perm_0 = const()[name = string("transpose_12_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_6_reps_0 = const()[name = string("tile_6_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_12_cast_fp16 = transpose(perm = transpose_12_perm_0, x = K_for_attn_7_cast_fp16)[name = string("transpose_247")];
+            tensor<fp16, [8, 1, 512, 256]> tile_6_cast_fp16 = tile(reps = tile_6_reps_0, x = transpose_12_cast_fp16)[name = string("tile_6_cast_fp16")];
+            tensor<int32, [5]> concat_12 = const()[name = string("concat_12"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_12_cast_fp16 = reshape(shape = concat_12, x = tile_6_cast_fp16)[name = string("reshape_12_cast_fp16")];
+            tensor<int32, [5]> transpose_13_perm_0 = const()[name = string("transpose_13_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_13 = const()[name = string("concat_13"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_13_cast_fp16 = transpose(perm = transpose_13_perm_0, x = reshape_12_cast_fp16)[name = string("transpose_246")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_13_cast_fp16 = reshape(shape = concat_13, x = transpose_13_cast_fp16)[name = string("reshape_13_cast_fp16")];
+            tensor<int32, [4]> transpose_87_perm_0 = const()[name = string("transpose_87_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_14_perm_0 = const()[name = string("transpose_14_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_7_reps_0 = const()[name = string("tile_7_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_14_cast_fp16 = transpose(perm = transpose_14_perm_0, x = V_for_attn_7_cast_fp16)[name = string("transpose_245")];
+            tensor<fp16, [8, 1, 512, 256]> tile_7_cast_fp16 = tile(reps = tile_7_reps_0, x = transpose_14_cast_fp16)[name = string("tile_7_cast_fp16")];
+            tensor<int32, [5]> concat_14 = const()[name = string("concat_14"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_14_cast_fp16 = reshape(shape = concat_14, x = tile_7_cast_fp16)[name = string("reshape_14_cast_fp16")];
+            tensor<int32, [5]> transpose_15_perm_0 = const()[name = string("transpose_15_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_15 = const()[name = string("concat_15"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_15_cast_fp16 = transpose(perm = transpose_15_perm_0, x = reshape_14_cast_fp16)[name = string("transpose_244")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_15_cast_fp16 = reshape(shape = concat_15, x = transpose_15_cast_fp16)[name = string("reshape_15_cast_fp16")];
+            tensor<int32, [4]> V_expanded_7_perm_0 = const()[name = string("V_expanded_7_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_87_cast_fp16 = transpose(perm = transpose_87_perm_0, x = reshape_13_cast_fp16)[name = string("transpose_243")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_31_cast_fp16, y = transpose_87_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_67_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 1, 512]> var_3142 = sub(x = x_67_cast_fp16, y = reduce_max_3)[name = string("op_3142")];
+            tensor<fp16, [1, 8, 1, 512]> var_3148 = exp(x = var_3142)[name = string("op_3148")];
+            tensor<int32, [1]> var_3158_axes_0 = const()[name = string("op_3158_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3158_keep_dims_0 = const()[name = string("op_3158_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3158 = reduce_sum(axes = var_3158_axes_0, keep_dims = var_3158_keep_dims_0, x = var_3148)[name = string("op_3158")];
+            tensor<fp16, [1, 8, 1, 512]> var_3164_cast_fp16 = real_div(x = var_3148, y = var_3158)[name = string("op_3164_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_7_cast_fp16 = transpose(perm = V_expanded_7_perm_0, x = reshape_15_cast_fp16)[name = string("transpose_242")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_3164_cast_fp16, y = V_expanded_7_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_3175 = const()[name = string("op_3175"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3182 = const()[name = string("op_3182"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3176_cast_fp16 = transpose(perm = var_3175, x = attn_output_19_cast_fp16)[name = string("transpose_241")];
+            tensor<fp16, [1, 1, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_3182, x = var_3176_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_3187 = const()[name = string("op_3187"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3203_pad_type_0 = const()[name = string("op_3203_pad_type_0"), val = string("valid")];
+            int32 var_3203_groups_0 = const()[name = string("op_3203_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3203_strides_0 = const()[name = string("op_3203_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3203_pad_0 = const()[name = string("op_3203_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3203_dilations_0 = const()[name = string("op_3203_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(923522688))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926144192))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3188_cast_fp16 = transpose(perm = var_3187, x = attn_output_21_cast_fp16)[name = string("transpose_240")];
+            tensor<fp16, [1, 2560, 1]> var_3203_cast_fp16 = conv(dilations = var_3203_dilations_0, groups = var_3203_groups_0, pad = var_3203_pad_0, pad_type = var_3203_pad_type_0, strides = var_3203_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_3188_cast_fp16)[name = string("op_3203_cast_fp16")];
+            tensor<int32, [3]> var_3207 = const()[name = string("op_3207"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3213 = const()[name = string("op_3213"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_71_cast_fp16 = transpose(perm = var_3207, x = var_3203_cast_fp16)[name = string("transpose_239")];
+            tensor<fp16, [1, 1, 2560]> var_3215_cast_fp16 = mul(x = x_71_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_3215_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_105_cast_fp16 = concat(axis = var_3213, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_3215_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3210_to_fp16 = const()[name = string("op_3210_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_3210_to_fp16, x = input_105_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_3220_split_sizes_0 = const()[name = string("op_3220_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3220_axis_0 = const()[name = string("op_3220_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3220_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3220_cast_fp16_1 = split(axis = var_3220_axis_0, split_sizes = var_3220_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_3220_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926146816)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_23_cast_fp16 = mul(x = var_3220_cast_fp16_0, y = layers_c2_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_73_cast_fp16 = add(x = x_59_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_3229 = const()[name = string("op_3229"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3231_cast_fp16 = mul(x = x_73_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_3231_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_107_cast_fp16 = concat(axis = var_3229, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_3231_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3226_to_fp16 = const()[name = string("op_3226_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_3226_to_fp16, x = input_107_cast_fp16)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_3236_split_sizes_0 = const()[name = string("op_3236_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3236_axis_0 = const()[name = string("op_3236_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3236_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3236_cast_fp16_1 = split(axis = var_3236_axis_0, split_sizes = var_3236_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_3236_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926152000)))];
+            tensor<fp16, [1, 1, 2560]> h_21_cast_fp16 = mul(x = var_3236_cast_fp16_0, y = layers_c2_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_3247 = const()[name = string("op_3247"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3248 = transpose(perm = var_3247, x = h_21_cast_fp16)[name = string("transpose_238")];
+            tensor<fp16, [1, 2560, 1, 1]> input_109 = expand_dims(axes = input_109_axes_0, x = var_3248)[name = string("input_109")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_c2_3_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_c2_3_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 1]> input_111 = mul(x = gate_15, y = up_7)[name = string("input_111")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_c2_3_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_3288_axes_0 = const()[name = string("op_3288_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3288 = squeeze(axes = var_3288_axes_0, x = mlp_out_7)[name = string("op_3288")];
+            tensor<int32, [3]> var_3292 = const()[name = string("op_3292"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3298 = const()[name = string("op_3298"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_75 = transpose(perm = var_3292, x = var_3288)[name = string("transpose_237")];
+            tensor<fp16, [1, 1, 2560]> var_3300 = mul(x = x_75, y = const_45_promoted)[name = string("op_3300")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_113 = concat(axis = var_3298, interleave = input_113_interleave_0, values = (x_75, var_3300))[name = string("input_113")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3295_to_fp16 = const()[name = string("op_3295_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_3295_to_fp16, x = input_113)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_3305_split_sizes_0 = const()[name = string("op_3305_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3305_axis_0 = const()[name = string("op_3305_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3305_0, tensor<fp16, [1, 1, 2560]> var_3305_1 = split(axis = var_3305_axis_0, split_sizes = var_3305_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_3305")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_33 = mul(x = var_3305_0, y = layers_c2_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_35_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 3840])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 1, 4096])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_3333 = const()[name = string("op_3333"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3334 = transpose(perm = var_3333, x = hidden_states_35_cast_fp16)[name = string("transpose_236")];
+            tensor<fp16, [1, 2560, 1, 1]> input_115 = expand_dims(axes = input_115_axes_0, x = var_3334)[name = string("input_115")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_c2_3_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_3353 = const()[name = string("op_3353"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3354_cast_fp16 = transpose(perm = var_3353, x = per_layer_slice_7_cast_fp16)[name = string("transpose_235")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_3354_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_117_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926157184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926484928))))[name = string("layers_c2_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_c2_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_3370_axes_0 = const()[name = string("op_3370_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3370_cast_fp16 = squeeze(axes = var_3370_axes_0, x = gated_23_cast_fp16)[name = string("op_3370_cast_fp16")];
+            tensor<int32, [3]> var_3374 = const()[name = string("op_3374"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3380 = const()[name = string("op_3380"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_77_cast_fp16 = transpose(perm = var_3374, x = var_3370_cast_fp16)[name = string("transpose_234")];
+            tensor<fp16, [1, 1, 2560]> var_3382_cast_fp16 = mul(x = x_77_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_3382_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_119_cast_fp16 = concat(axis = var_3380, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_3382_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3377_to_fp16 = const()[name = string("op_3377_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_3377_to_fp16, x = input_119_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_3387_split_sizes_0 = const()[name = string("op_3387_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3387_axis_0 = const()[name = string("op_3387_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3387_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3387_cast_fp16_1 = split(axis = var_3387_axis_0, split_sizes = var_3387_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_3387_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926487552)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_39_cast_fp16 = mul(x = var_3387_cast_fp16_0, y = layers_c2_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.14p-1])];
+            tensor<fp16, [1, 1, 2560]> x_79_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            tensor<int32, [1]> var_3399_axes_0 = const()[name = string("op_3399_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3399_cast_fp16 = squeeze(axes = var_3399_axes_0, x = K_sliding_out_7_cast_fp16)[name = string("op_3399_cast_fp16")];
+            tensor<int32, [1]> var_3401_axes_0 = const()[name = string("op_3401_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3401_cast_fp16 = squeeze(axes = var_3401_axes_0, x = V_sliding_out_7_cast_fp16)[name = string("op_3401_cast_fp16")];
+            tensor<int32, [4]> var_3404_begin_0 = const()[name = string("op_3404_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3404_end_0 = const()[name = string("op_3404_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3404_end_mask_0 = const()[name = string("op_3404_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3404_squeeze_mask_0 = const()[name = string("op_3404_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_3404_cast_fp16 = slice_by_index(begin = var_3404_begin_0, end = var_3404_end_0, end_mask = var_3404_end_mask_0, squeeze_mask = var_3404_squeeze_mask_0, x = K_sliding_in)[name = string("op_3404_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_9_axes_0 = const()[name = string("K_sliding_slot_9_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_9_cast_fp16 = expand_dims(axes = K_sliding_slot_9_axes_0, x = var_3404_cast_fp16)[name = string("K_sliding_slot_9_cast_fp16")];
+            tensor<int32, [4]> var_3409_begin_0 = const()[name = string("op_3409_begin_0"), val = tensor<int32, [4]>([4, 0, 0, 0])];
+            tensor<int32, [4]> var_3409_end_0 = const()[name = string("op_3409_end_0"), val = tensor<int32, [4]>([5, 2, 512, 512])];
+            tensor<bool, [4]> var_3409_end_mask_0 = const()[name = string("op_3409_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3409_squeeze_mask_0 = const()[name = string("op_3409_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_3409_cast_fp16 = slice_by_index(begin = var_3409_begin_0, end = var_3409_end_0, end_mask = var_3409_end_mask_0, squeeze_mask = var_3409_squeeze_mask_0, x = V_sliding_in)[name = string("op_3409_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_9_axes_0 = const()[name = string("V_sliding_slot_9_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_9_cast_fp16 = expand_dims(axes = V_sliding_slot_9_axes_0, x = var_3409_cast_fp16)[name = string("V_sliding_slot_9_cast_fp16")];
+            int32 var_3416 = const()[name = string("op_3416"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3418_cast_fp16 = mul(x = x_79_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_3418_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_121_cast_fp16 = concat(axis = var_3416, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_3418_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3413_to_fp16 = const()[name = string("op_3413_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_3413_to_fp16, x = input_121_cast_fp16)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_3423_split_sizes_0 = const()[name = string("op_3423_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3423_axis_0 = const()[name = string("op_3423_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3423_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3423_cast_fp16_1 = split(axis = var_3423_axis_0, split_sizes = var_3423_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_3423_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926492736)))];
+            tensor<fp16, [1, 1, 2560]> h_25_cast_fp16 = mul(x = var_3423_cast_fp16_0, y = layers_c2_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_3429 = const()[name = string("op_3429"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3432_axes_0 = const()[name = string("op_3432_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3430_cast_fp16 = transpose(perm = var_3429, x = h_25_cast_fp16)[name = string("transpose_233")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3432_cast_fp16 = expand_dims(axes = var_3432_axes_0, x = var_3430_cast_fp16)[name = string("op_3432_cast_fp16")];
+            string var_3448_pad_type_0 = const()[name = string("op_3448_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3448_strides_0 = const()[name = string("op_3448_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3448_pad_0 = const()[name = string("op_3448_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3448_dilations_0 = const()[name = string("op_3448_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3448_groups_0 = const()[name = string("op_3448_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3448 = conv(dilations = var_3448_dilations_0, groups = var_3448_groups_0, pad = var_3448_pad_0, pad_type = var_3448_pad_type_0, strides = var_3448_strides_0, weight = layers_c2_4_self_attn_q_proj_weight_palettized, x = var_3432_cast_fp16)[name = string("op_3448")];
+            tensor<int32, [4]> var_3453 = const()[name = string("op_3453"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3454 = reshape(shape = var_3453, x = var_3448)[name = string("op_3454")];
+            tensor<int32, [4]> var_3459 = const()[name = string("op_3459"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3469 = const()[name = string("op_3469"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3460 = transpose(perm = var_3459, x = var_3454)[name = string("transpose_232")];
+            tensor<fp16, [1, 8, 256]> x_81 = reshape(shape = var_3469, x = var_3460)[name = string("x_81")];
+            int32 var_3475 = const()[name = string("op_3475"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3477 = mul(x = x_81, y = const_49_promoted)[name = string("op_3477")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_125 = concat(axis = var_3475, interleave = input_125_interleave_0, values = (x_81, var_3477))[name = string("input_125")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3472_to_fp16 = const()[name = string("op_3472_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_3472_to_fp16, x = input_125)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_3482_split_sizes_0 = const()[name = string("op_3482_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3482_axis_0 = const()[name = string("op_3482_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3482_0, tensor<fp16, [1, 8, 256]> var_3482_1 = split(axis = var_3482_axis_0, split_sizes = var_3482_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_3482")];
+            tensor<fp16, [1, 8, 256]> var_3484 = mul(x = var_3482_0, y = layers_c2_4_self_attn_q_norm_weight)[name = string("op_3484")];
+            tensor<int32, [4]> var_3489 = const()[name = string("op_3489"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_35 = reshape(shape = var_3489, x = var_3484)[name = string("q_35")];
+            tensor<fp16, [1, 8, 1, 256]> var_3491_cast_fp16 = mul(x = q_35, y = cos_s)[name = string("op_3491_cast_fp16")];
+            tensor<int32, [2]> var_3492_split_sizes_0 = const()[name = string("op_3492_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3492_axis_0 = const()[name = string("op_3492_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3492_0, tensor<fp16, [1, 8, 1, 128]> var_3492_1 = split(axis = var_3492_axis_0, split_sizes = var_3492_split_sizes_0, x = q_35)[name = string("op_3492")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3494 = mul(x = var_3492_1, y = const_50_promoted)[name = string("op_3494")];
+            int32 var_3496 = const()[name = string("op_3496"), val = int32(-1)];
+            bool var_3497_interleave_0 = const()[name = string("op_3497_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3497 = concat(axis = var_3496, interleave = var_3497_interleave_0, values = (var_3494, var_3492_0))[name = string("op_3497")];
+            tensor<fp16, [1, 8, 1, 256]> var_3498_cast_fp16 = mul(x = var_3497, y = sin_s)[name = string("op_3498_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_39_cast_fp16 = add(x = var_3491_cast_fp16, y = var_3498_cast_fp16)[name = string("q_39_cast_fp16")];
+            string var_3511_pad_type_0 = const()[name = string("op_3511_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3511_strides_0 = const()[name = string("op_3511_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3511_pad_0 = const()[name = string("op_3511_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3511_dilations_0 = const()[name = string("op_3511_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3511_groups_0 = const()[name = string("op_3511_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_3511 = conv(dilations = var_3511_dilations_0, groups = var_3511_groups_0, pad = var_3511_pad_0, pad_type = var_3511_pad_type_0, strides = var_3511_strides_0, weight = layers_c2_4_self_attn_k_proj_weight_palettized, x = var_3432_cast_fp16)[name = string("op_3511")];
+            tensor<int32, [4]> var_3516 = const()[name = string("op_3516"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_3517 = reshape(shape = var_3516, x = var_3511)[name = string("op_3517")];
+            tensor<int32, [4]> var_3522 = const()[name = string("op_3522"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_3539_pad_type_0 = const()[name = string("op_3539_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3539_strides_0 = const()[name = string("op_3539_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3539_pad_0 = const()[name = string("op_3539_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3539_dilations_0 = const()[name = string("op_3539_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3539_groups_0 = const()[name = string("op_3539_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_3539 = conv(dilations = var_3539_dilations_0, groups = var_3539_groups_0, pad = var_3539_pad_0, pad_type = var_3539_pad_type_0, strides = var_3539_strides_0, weight = layers_c2_4_self_attn_v_proj_weight_palettized, x = var_3432_cast_fp16)[name = string("op_3539")];
+            tensor<int32, [4]> var_3544 = const()[name = string("op_3544"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_3545 = reshape(shape = var_3544, x = var_3539)[name = string("op_3545")];
+            tensor<int32, [4]> var_3550 = const()[name = string("op_3550"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3560 = const()[name = string("op_3560"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_3523 = transpose(perm = var_3522, x = var_3517)[name = string("transpose_231")];
+            tensor<fp16, [1, 2, 256]> x_83 = reshape(shape = var_3560, x = var_3523)[name = string("x_83")];
+            int32 var_3566 = const()[name = string("op_3566"), val = int32(-1)];
+            fp16 const_51_promoted = const()[name = string("const_51_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_3568 = mul(x = x_83, y = const_51_promoted)[name = string("op_3568")];
+            bool input_127_interleave_0 = const()[name = string("input_127_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_127 = concat(axis = var_3566, interleave = input_127_interleave_0, values = (x_83, var_3568))[name = string("input_127")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3563_to_fp16 = const()[name = string("op_3563_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_3563_to_fp16, x = input_127)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_3573_split_sizes_0 = const()[name = string("op_3573_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3573_axis_0 = const()[name = string("op_3573_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_3573_0, tensor<fp16, [1, 2, 256]> var_3573_1 = split(axis = var_3573_axis_0, split_sizes = var_3573_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_3573")];
+            tensor<fp16, [1, 2, 256]> var_3575 = mul(x = var_3573_0, y = layers_c2_4_self_attn_k_norm_weight)[name = string("op_3575")];
+            tensor<int32, [4]> var_3580 = const()[name = string("op_3580"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_37 = reshape(shape = var_3580, x = var_3575)[name = string("q_37")];
+            fp16 var_3582_promoted = const()[name = string("op_3582_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_3551 = transpose(perm = var_3550, x = var_3545)[name = string("transpose_230")];
+            tensor<fp16, [1, 2, 1, 256]> var_3583 = pow(x = var_3551, y = var_3582_promoted)[name = string("op_3583")];
+            tensor<int32, [1]> var_3588_axes_0 = const()[name = string("op_3588_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3588_keep_dims_0 = const()[name = string("op_3588_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_3588 = reduce_mean(axes = var_3588_axes_0, keep_dims = var_3588_keep_dims_0, x = var_3583)[name = string("op_3588")];
+            fp16 var_3590_to_fp16 = const()[name = string("op_3590_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_9_cast_fp16 = add(x = var_3588, y = var_3590_to_fp16)[name = string("mean_sq_9_cast_fp16")];
+            fp32 var_3592_epsilon_0 = const()[name = string("op_3592_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_3592_cast_fp16 = rsqrt(epsilon = var_3592_epsilon_0, x = mean_sq_9_cast_fp16)[name = string("op_3592_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_131_cast_fp16 = mul(x = var_3551, y = var_3592_cast_fp16)[name = string("input_131_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_3594_cast_fp16 = mul(x = q_37, y = cos_s)[name = string("op_3594_cast_fp16")];
+            tensor<int32, [2]> var_3595_split_sizes_0 = const()[name = string("op_3595_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3595_axis_0 = const()[name = string("op_3595_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_3595_0, tensor<fp16, [1, 2, 1, 128]> var_3595_1 = split(axis = var_3595_axis_0, split_sizes = var_3595_split_sizes_0, x = q_37)[name = string("op_3595")];
+            fp16 const_52_promoted = const()[name = string("const_52_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_3597 = mul(x = var_3595_1, y = const_52_promoted)[name = string("op_3597")];
+            int32 var_3599 = const()[name = string("op_3599"), val = int32(-1)];
+            bool var_3600_interleave_0 = const()[name = string("op_3600_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_3600 = concat(axis = var_3599, interleave = var_3600_interleave_0, values = (var_3597, var_3595_0))[name = string("op_3600")];
+            tensor<fp16, [1, 2, 1, 256]> var_3601_cast_fp16 = mul(x = var_3600, y = sin_s)[name = string("op_3601_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_129_cast_fp16 = add(x = var_3594_cast_fp16, y = var_3601_cast_fp16)[name = string("input_129_cast_fp16")];
+            tensor<int32, [8]> k_padded_9_pad_0 = const()[name = string("k_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_9_mode_0 = const()[name = string("k_padded_9_mode_0"), val = string("constant")];
+            fp16 const_53_to_fp16 = const()[name = string("const_53_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_9_cast_fp16 = pad(constant_val = const_53_to_fp16, mode = k_padded_9_mode_0, pad = k_padded_9_pad_0, x = input_129_cast_fp16)[name = string("k_padded_9_cast_fp16")];
+            tensor<int32, [8]> v_padded_9_pad_0 = const()[name = string("v_padded_9_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_9_mode_0 = const()[name = string("v_padded_9_mode_0"), val = string("constant")];
+            fp16 const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_9_cast_fp16 = pad(constant_val = const_54_to_fp16, mode = v_padded_9_mode_0, pad = v_padded_9_pad_0, x = input_131_cast_fp16)[name = string("v_padded_9_cast_fp16")];
+            tensor<int32, [4]> var_3630_begin_0 = const()[name = string("op_3630_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3630_end_0 = const()[name = string("op_3630_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3630_end_mask_0 = const()[name = string("op_3630_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3630_cast_fp16 = slice_by_index(begin = var_3630_begin_0, end = var_3630_end_0, end_mask = var_3630_end_mask_0, x = K_sliding_slot_9_cast_fp16)[name = string("op_3630_cast_fp16")];
+            int32 var_3637 = const()[name = string("op_3637"), val = int32(2)];
+            bool K_sliding_out_9_interleave_0 = const()[name = string("K_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_9_cast_fp16 = concat(axis = var_3637, interleave = K_sliding_out_9_interleave_0, values = (var_3630_cast_fp16, k_padded_9_cast_fp16))[name = string("K_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> var_3653_begin_0 = const()[name = string("op_3653_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_3653_end_0 = const()[name = string("op_3653_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_3653_end_mask_0 = const()[name = string("op_3653_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_3653_cast_fp16 = slice_by_index(begin = var_3653_begin_0, end = var_3653_end_0, end_mask = var_3653_end_mask_0, x = V_sliding_slot_9_cast_fp16)[name = string("op_3653_cast_fp16")];
+            int32 var_3660 = const()[name = string("op_3660"), val = int32(2)];
+            bool V_sliding_out_9_interleave_0 = const()[name = string("V_sliding_out_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_9_cast_fp16 = concat(axis = var_3660, interleave = V_sliding_out_9_interleave_0, values = (var_3653_cast_fp16, v_padded_9_cast_fp16))[name = string("V_sliding_out_9_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_9_begin_0 = const()[name = string("K_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_9_end_0 = const()[name = string("K_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_9_end_mask_0 = const()[name = string("K_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_9_cast_fp16 = slice_by_index(begin = K_for_attn_9_begin_0, end = K_for_attn_9_end_0, end_mask = K_for_attn_9_end_mask_0, x = K_sliding_out_9_cast_fp16)[name = string("K_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_9_begin_0 = const()[name = string("V_for_attn_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_9_end_0 = const()[name = string("V_for_attn_9_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_9_end_mask_0 = const()[name = string("V_for_attn_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_9_cast_fp16 = slice_by_index(begin = V_for_attn_9_begin_0, end = V_for_attn_9_end_0, end_mask = V_for_attn_9_end_mask_0, x = V_sliding_out_9_cast_fp16)[name = string("V_for_attn_9_cast_fp16")];
+            tensor<int32, [4]> transpose_16_perm_0 = const()[name = string("transpose_16_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_8_reps_0 = const()[name = string("tile_8_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_16_cast_fp16 = transpose(perm = transpose_16_perm_0, x = K_for_attn_9_cast_fp16)[name = string("transpose_229")];
+            tensor<fp16, [8, 1, 512, 256]> tile_8_cast_fp16 = tile(reps = tile_8_reps_0, x = transpose_16_cast_fp16)[name = string("tile_8_cast_fp16")];
+            tensor<int32, [5]> concat_16 = const()[name = string("concat_16"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_16_cast_fp16 = reshape(shape = concat_16, x = tile_8_cast_fp16)[name = string("reshape_16_cast_fp16")];
+            tensor<int32, [5]> transpose_17_perm_0 = const()[name = string("transpose_17_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_17 = const()[name = string("concat_17"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_17_cast_fp16 = transpose(perm = transpose_17_perm_0, x = reshape_16_cast_fp16)[name = string("transpose_228")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_17_cast_fp16 = reshape(shape = concat_17, x = transpose_17_cast_fp16)[name = string("reshape_17_cast_fp16")];
+            tensor<int32, [4]> transpose_88_perm_0 = const()[name = string("transpose_88_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_18_perm_0 = const()[name = string("transpose_18_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_9_reps_0 = const()[name = string("tile_9_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_18_cast_fp16 = transpose(perm = transpose_18_perm_0, x = V_for_attn_9_cast_fp16)[name = string("transpose_227")];
+            tensor<fp16, [8, 1, 512, 256]> tile_9_cast_fp16 = tile(reps = tile_9_reps_0, x = transpose_18_cast_fp16)[name = string("tile_9_cast_fp16")];
+            tensor<int32, [5]> concat_18 = const()[name = string("concat_18"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_18_cast_fp16 = reshape(shape = concat_18, x = tile_9_cast_fp16)[name = string("reshape_18_cast_fp16")];
+            tensor<int32, [5]> transpose_19_perm_0 = const()[name = string("transpose_19_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_19 = const()[name = string("concat_19"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_19_cast_fp16 = transpose(perm = transpose_19_perm_0, x = reshape_18_cast_fp16)[name = string("transpose_226")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_19_cast_fp16 = reshape(shape = concat_19, x = transpose_19_cast_fp16)[name = string("reshape_19_cast_fp16")];
+            tensor<int32, [4]> V_expanded_9_perm_0 = const()[name = string("V_expanded_9_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_88_cast_fp16 = transpose(perm = transpose_88_perm_0, x = reshape_17_cast_fp16)[name = string("transpose_225")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_39_cast_fp16, y = transpose_88_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_87_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_87_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_87_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 1, 512]> var_3701 = sub(x = x_87_cast_fp16, y = reduce_max_4)[name = string("op_3701")];
+            tensor<fp16, [1, 8, 1, 512]> var_3707 = exp(x = var_3701)[name = string("op_3707")];
+            tensor<int32, [1]> var_3717_axes_0 = const()[name = string("op_3717_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3717_keep_dims_0 = const()[name = string("op_3717_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3717 = reduce_sum(axes = var_3717_axes_0, keep_dims = var_3717_keep_dims_0, x = var_3707)[name = string("op_3717")];
+            tensor<fp16, [1, 8, 1, 512]> var_3723_cast_fp16 = real_div(x = var_3707, y = var_3717)[name = string("op_3723_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_9_cast_fp16 = transpose(perm = V_expanded_9_perm_0, x = reshape_19_cast_fp16)[name = string("transpose_224")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_3723_cast_fp16, y = V_expanded_9_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_3734 = const()[name = string("op_3734"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3741 = const()[name = string("op_3741"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3735_cast_fp16 = transpose(perm = var_3734, x = attn_output_25_cast_fp16)[name = string("transpose_223")];
+            tensor<fp16, [1, 1, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_3741, x = var_3735_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_3746 = const()[name = string("op_3746"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3762_pad_type_0 = const()[name = string("op_3762_pad_type_0"), val = string("valid")];
+            int32 var_3762_groups_0 = const()[name = string("op_3762_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3762_strides_0 = const()[name = string("op_3762_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3762_pad_0 = const()[name = string("op_3762_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3762_dilations_0 = const()[name = string("op_3762_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(926497920))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929119424))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3747_cast_fp16 = transpose(perm = var_3746, x = attn_output_27_cast_fp16)[name = string("transpose_222")];
+            tensor<fp16, [1, 2560, 1]> var_3762_cast_fp16 = conv(dilations = var_3762_dilations_0, groups = var_3762_groups_0, pad = var_3762_pad_0, pad_type = var_3762_pad_type_0, strides = var_3762_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_3747_cast_fp16)[name = string("op_3762_cast_fp16")];
+            tensor<int32, [3]> var_3766 = const()[name = string("op_3766"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3772 = const()[name = string("op_3772"), val = int32(-1)];
+            fp16 const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_91_cast_fp16 = transpose(perm = var_3766, x = var_3762_cast_fp16)[name = string("transpose_221")];
+            tensor<fp16, [1, 1, 2560]> var_3774_cast_fp16 = mul(x = x_91_cast_fp16, y = const_55_promoted_to_fp16)[name = string("op_3774_cast_fp16")];
+            bool input_135_interleave_0 = const()[name = string("input_135_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_135_cast_fp16 = concat(axis = var_3772, interleave = input_135_interleave_0, values = (x_91_cast_fp16, var_3774_cast_fp16))[name = string("input_135_cast_fp16")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3769_to_fp16 = const()[name = string("op_3769_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_3769_to_fp16, x = input_135_cast_fp16)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_3779_split_sizes_0 = const()[name = string("op_3779_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3779_axis_0 = const()[name = string("op_3779_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3779_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3779_cast_fp16_1 = split(axis = var_3779_axis_0, split_sizes = var_3779_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_3779_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929122048)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_29_cast_fp16 = mul(x = var_3779_cast_fp16_0, y = layers_c2_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_93_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_93_cast_fp16")];
+            int32 var_3788 = const()[name = string("op_3788"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3790_cast_fp16 = mul(x = x_93_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3790_cast_fp16")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_137_cast_fp16 = concat(axis = var_3788, interleave = input_137_interleave_0, values = (x_93_cast_fp16, var_3790_cast_fp16))[name = string("input_137_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3785_to_fp16 = const()[name = string("op_3785_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_3785_to_fp16, x = input_137_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_3795_split_sizes_0 = const()[name = string("op_3795_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3795_axis_0 = const()[name = string("op_3795_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3795_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3795_cast_fp16_1 = split(axis = var_3795_axis_0, split_sizes = var_3795_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_3795_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929127232)))];
+            tensor<fp16, [1, 1, 2560]> h_27_cast_fp16 = mul(x = var_3795_cast_fp16_0, y = layers_c2_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_3806 = const()[name = string("op_3806"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3807 = transpose(perm = var_3806, x = h_27_cast_fp16)[name = string("transpose_220")];
+            tensor<fp16, [1, 2560, 1, 1]> input_139 = expand_dims(axes = input_139_axes_0, x = var_3807)[name = string("input_139")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_c2_4_mlp_gate_proj_weight_palettized, x = input_139)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_c2_4_mlp_up_proj_weight_palettized, x = input_139)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 1]> input_141 = mul(x = gate_19, y = up_9)[name = string("input_141")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_c2_4_mlp_down_proj_weight_palettized, x = input_141)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_3847_axes_0 = const()[name = string("op_3847_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3847 = squeeze(axes = var_3847_axes_0, x = mlp_out_9)[name = string("op_3847")];
+            tensor<int32, [3]> var_3851 = const()[name = string("op_3851"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3857 = const()[name = string("op_3857"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_95 = transpose(perm = var_3851, x = var_3847)[name = string("transpose_219")];
+            tensor<fp16, [1, 1, 2560]> var_3859 = mul(x = x_95, y = const_57_promoted)[name = string("op_3859")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_143 = concat(axis = var_3857, interleave = input_143_interleave_0, values = (x_95, var_3859))[name = string("input_143")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3854_to_fp16 = const()[name = string("op_3854_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_3854_to_fp16, x = input_143)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_3864_split_sizes_0 = const()[name = string("op_3864_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3864_axis_0 = const()[name = string("op_3864_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3864_0, tensor<fp16, [1, 1, 2560]> var_3864_1 = split(axis = var_3864_axis_0, split_sizes = var_3864_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_3864")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_43 = mul(x = var_3864_0, y = layers_c2_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_45_cast_fp16 = add(x = x_93_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 4096])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 1, 4352])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_3892 = const()[name = string("op_3892"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_145_axes_0 = const()[name = string("input_145_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3893 = transpose(perm = var_3892, x = hidden_states_45_cast_fp16)[name = string("transpose_218")];
+            tensor<fp16, [1, 2560, 1, 1]> input_145 = expand_dims(axes = input_145_axes_0, x = var_3893)[name = string("input_145")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_c2_4_per_layer_input_gate_weight_palettized, x = input_145)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_3912 = const()[name = string("op_3912"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3913_cast_fp16 = transpose(perm = var_3912, x = per_layer_slice_9_cast_fp16)[name = string("transpose_217")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_3913_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_147_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_147_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929132416))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929460160))))[name = string("layers_c2_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_c2_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_147_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_3929_axes_0 = const()[name = string("op_3929_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3929_cast_fp16 = squeeze(axes = var_3929_axes_0, x = gated_29_cast_fp16)[name = string("op_3929_cast_fp16")];
+            tensor<int32, [3]> var_3933 = const()[name = string("op_3933"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3939 = const()[name = string("op_3939"), val = int32(-1)];
+            fp16 const_58_promoted_to_fp16 = const()[name = string("const_58_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_97_cast_fp16 = transpose(perm = var_3933, x = var_3929_cast_fp16)[name = string("transpose_216")];
+            tensor<fp16, [1, 1, 2560]> var_3941_cast_fp16 = mul(x = x_97_cast_fp16, y = const_58_promoted_to_fp16)[name = string("op_3941_cast_fp16")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_149_cast_fp16 = concat(axis = var_3939, interleave = input_149_interleave_0, values = (x_97_cast_fp16, var_3941_cast_fp16))[name = string("input_149_cast_fp16")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3936_to_fp16 = const()[name = string("op_3936_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_3936_to_fp16, x = input_149_cast_fp16)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_3946_split_sizes_0 = const()[name = string("op_3946_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3946_axis_0 = const()[name = string("op_3946_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3946_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3946_cast_fp16_1 = split(axis = var_3946_axis_0, split_sizes = var_3946_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_3946_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929462784)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_49_cast_fp16 = mul(x = var_3946_cast_fp16_0, y = layers_c2_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.46p-1])];
+            tensor<fp16, [1, 1, 2560]> x_99_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_59_promoted_to_fp16)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> var_3958_axes_0 = const()[name = string("op_3958_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3958_cast_fp16 = squeeze(axes = var_3958_axes_0, x = K_sliding_out_9_cast_fp16)[name = string("op_3958_cast_fp16")];
+            tensor<int32, [1]> var_3960_axes_0 = const()[name = string("op_3960_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_3960_cast_fp16 = squeeze(axes = var_3960_axes_0, x = V_sliding_out_9_cast_fp16)[name = string("op_3960_cast_fp16")];
+            tensor<int32, [4]> var_3963_begin_0 = const()[name = string("op_3963_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3963_end_0 = const()[name = string("op_3963_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_3963_end_mask_0 = const()[name = string("op_3963_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3963_squeeze_mask_0 = const()[name = string("op_3963_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_3963_cast_fp16 = slice_by_index(begin = var_3963_begin_0, end = var_3963_end_0, end_mask = var_3963_end_mask_0, squeeze_mask = var_3963_squeeze_mask_0, x = K_full_in)[name = string("op_3963_cast_fp16")];
+            tensor<int32, [1]> K_full_slot_1_axes_0 = const()[name = string("K_full_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_slot_1_cast_fp16 = expand_dims(axes = K_full_slot_1_axes_0, x = var_3963_cast_fp16)[name = string("K_full_slot_1_cast_fp16")];
+            tensor<int32, [4]> var_3968_begin_0 = const()[name = string("op_3968_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> var_3968_end_0 = const()[name = string("op_3968_end_0"), val = tensor<int32, [4]>([1, 2, 2048, 512])];
+            tensor<bool, [4]> var_3968_end_mask_0 = const()[name = string("op_3968_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_3968_squeeze_mask_0 = const()[name = string("op_3968_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_3968_cast_fp16 = slice_by_index(begin = var_3968_begin_0, end = var_3968_end_0, end_mask = var_3968_end_mask_0, squeeze_mask = var_3968_squeeze_mask_0, x = V_full_in)[name = string("op_3968_cast_fp16")];
+            tensor<int32, [1]> V_full_slot_1_axes_0 = const()[name = string("V_full_slot_1_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_slot_1_cast_fp16 = expand_dims(axes = V_full_slot_1_axes_0, x = var_3968_cast_fp16)[name = string("V_full_slot_1_cast_fp16")];
+            int32 var_3975 = const()[name = string("op_3975"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3977_cast_fp16 = mul(x = x_99_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3977_cast_fp16")];
+            bool input_151_interleave_0 = const()[name = string("input_151_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_151_cast_fp16 = concat(axis = var_3975, interleave = input_151_interleave_0, values = (x_99_cast_fp16, var_3977_cast_fp16))[name = string("input_151_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3972_to_fp16 = const()[name = string("op_3972_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_3972_to_fp16, x = input_151_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_3982_split_sizes_0 = const()[name = string("op_3982_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3982_axis_0 = const()[name = string("op_3982_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3982_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3982_cast_fp16_1 = split(axis = var_3982_axis_0, split_sizes = var_3982_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_3982_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929467968)))];
+            tensor<fp16, [1, 1, 2560]> h_31_cast_fp16 = mul(x = var_3982_cast_fp16_0, y = layers_c2_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_3988 = const()[name = string("op_3988"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3991_axes_0 = const()[name = string("op_3991_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3989_cast_fp16 = transpose(perm = var_3988, x = h_31_cast_fp16)[name = string("transpose_215")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3991_cast_fp16 = expand_dims(axes = var_3991_axes_0, x = var_3989_cast_fp16)[name = string("op_3991_cast_fp16")];
+            string var_4007_pad_type_0 = const()[name = string("op_4007_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4007_strides_0 = const()[name = string("op_4007_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4007_pad_0 = const()[name = string("op_4007_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4007_dilations_0 = const()[name = string("op_4007_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4007_groups_0 = const()[name = string("op_4007_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_4007 = conv(dilations = var_4007_dilations_0, groups = var_4007_groups_0, pad = var_4007_pad_0, pad_type = var_4007_pad_type_0, strides = var_4007_strides_0, weight = layers_c2_5_self_attn_q_proj_weight_palettized, x = var_3991_cast_fp16)[name = string("op_4007")];
+            tensor<int32, [4]> var_4012 = const()[name = string("op_4012"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_4013 = reshape(shape = var_4012, x = var_4007)[name = string("op_4013")];
+            tensor<int32, [4]> var_4018 = const()[name = string("op_4018"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4028 = const()[name = string("op_4028"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_4019 = transpose(perm = var_4018, x = var_4013)[name = string("transpose_214")];
+            tensor<fp16, [1, 8, 512]> x_101 = reshape(shape = var_4028, x = var_4019)[name = string("x_101")];
+            int32 var_4034 = const()[name = string("op_4034"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_4036 = mul(x = x_101, y = const_61_promoted)[name = string("op_4036")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_155 = concat(axis = var_4034, interleave = input_155_interleave_0, values = (x_101, var_4036))[name = string("input_155")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4031_to_fp16 = const()[name = string("op_4031_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_4031_to_fp16, x = input_155)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_4041_split_sizes_0 = const()[name = string("op_4041_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_4041_axis_0 = const()[name = string("op_4041_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_4041_0, tensor<fp16, [1, 8, 512]> var_4041_1 = split(axis = var_4041_axis_0, split_sizes = var_4041_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_4041")];
+            tensor<fp16, [1, 8, 512]> var_4043 = mul(x = var_4041_0, y = layers_c2_5_self_attn_q_norm_weight)[name = string("op_4043")];
+            tensor<int32, [4]> var_4048 = const()[name = string("op_4048"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_43 = reshape(shape = var_4048, x = var_4043)[name = string("q_43")];
+            tensor<fp16, [1, 8, 1, 512]> var_4050_cast_fp16 = mul(x = q_43, y = cos_f)[name = string("op_4050_cast_fp16")];
+            tensor<int32, [2]> var_4051_split_sizes_0 = const()[name = string("op_4051_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4051_axis_0 = const()[name = string("op_4051_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_4051_0, tensor<fp16, [1, 8, 1, 256]> var_4051_1 = split(axis = var_4051_axis_0, split_sizes = var_4051_split_sizes_0, x = q_43)[name = string("op_4051")];
+            fp16 const_62_promoted = const()[name = string("const_62_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_4053 = mul(x = var_4051_1, y = const_62_promoted)[name = string("op_4053")];
+            int32 var_4055 = const()[name = string("op_4055"), val = int32(-1)];
+            bool var_4056_interleave_0 = const()[name = string("op_4056_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_4056 = concat(axis = var_4055, interleave = var_4056_interleave_0, values = (var_4053, var_4051_0))[name = string("op_4056")];
+            tensor<fp16, [1, 8, 1, 512]> var_4057_cast_fp16 = mul(x = var_4056, y = sin_f)[name = string("op_4057_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_47_cast_fp16 = add(x = var_4050_cast_fp16, y = var_4057_cast_fp16)[name = string("q_47_cast_fp16")];
+            string var_4070_pad_type_0 = const()[name = string("op_4070_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4070_strides_0 = const()[name = string("op_4070_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4070_pad_0 = const()[name = string("op_4070_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4070_dilations_0 = const()[name = string("op_4070_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4070_groups_0 = const()[name = string("op_4070_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_4070 = conv(dilations = var_4070_dilations_0, groups = var_4070_groups_0, pad = var_4070_pad_0, pad_type = var_4070_pad_type_0, strides = var_4070_strides_0, weight = layers_c2_5_self_attn_k_proj_weight_palettized, x = var_3991_cast_fp16)[name = string("op_4070")];
+            tensor<int32, [4]> var_4075 = const()[name = string("op_4075"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_4076 = reshape(shape = var_4075, x = var_4070)[name = string("op_4076")];
+            tensor<int32, [4]> var_4081 = const()[name = string("op_4081"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_4098_pad_type_0 = const()[name = string("op_4098_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4098_strides_0 = const()[name = string("op_4098_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4098_pad_0 = const()[name = string("op_4098_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4098_dilations_0 = const()[name = string("op_4098_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4098_groups_0 = const()[name = string("op_4098_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_4098 = conv(dilations = var_4098_dilations_0, groups = var_4098_groups_0, pad = var_4098_pad_0, pad_type = var_4098_pad_type_0, strides = var_4098_strides_0, weight = layers_c2_5_self_attn_v_proj_weight_palettized, x = var_3991_cast_fp16)[name = string("op_4098")];
+            tensor<int32, [4]> var_4103 = const()[name = string("op_4103"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_4104 = reshape(shape = var_4103, x = var_4098)[name = string("op_4104")];
+            tensor<int32, [4]> var_4109 = const()[name = string("op_4109"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4119 = const()[name = string("op_4119"), val = tensor<int32, [3]>([1, 2, 512])];
+            tensor<fp16, [1, 2, 1, 512]> var_4082 = transpose(perm = var_4081, x = var_4076)[name = string("transpose_213")];
+            tensor<fp16, [1, 2, 512]> x_103 = reshape(shape = var_4119, x = var_4082)[name = string("x_103")];
+            int32 var_4125 = const()[name = string("op_4125"), val = int32(-1)];
+            fp16 const_63_promoted = const()[name = string("const_63_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 512]> var_4127 = mul(x = x_103, y = const_63_promoted)[name = string("op_4127")];
+            bool input_157_interleave_0 = const()[name = string("input_157_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1024]> input_157 = concat(axis = var_4125, interleave = input_157_interleave_0, values = (x_103, var_4127))[name = string("input_157")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4122_to_fp16 = const()[name = string("op_4122_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1024]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_4122_to_fp16, x = input_157)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_4132_split_sizes_0 = const()[name = string("op_4132_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_4132_axis_0 = const()[name = string("op_4132_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 512]> var_4132_0, tensor<fp16, [1, 2, 512]> var_4132_1 = split(axis = var_4132_axis_0, split_sizes = var_4132_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_4132")];
+            tensor<fp16, [1, 2, 512]> var_4134 = mul(x = var_4132_0, y = layers_c2_5_self_attn_k_norm_weight)[name = string("op_4134")];
+            tensor<int32, [4]> var_4139 = const()[name = string("op_4139"), val = tensor<int32, [4]>([1, 2, 1, 512])];
+            tensor<fp16, [1, 2, 1, 512]> q_45 = reshape(shape = var_4139, x = var_4134)[name = string("q_45")];
+            fp16 var_4141_promoted = const()[name = string("op_4141_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 512]> var_4110 = transpose(perm = var_4109, x = var_4104)[name = string("transpose_212")];
+            tensor<fp16, [1, 2, 1, 512]> var_4142 = pow(x = var_4110, y = var_4141_promoted)[name = string("op_4142")];
+            tensor<int32, [1]> var_4147_axes_0 = const()[name = string("op_4147_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4147_keep_dims_0 = const()[name = string("op_4147_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_4147 = reduce_mean(axes = var_4147_axes_0, keep_dims = var_4147_keep_dims_0, x = var_4142)[name = string("op_4147")];
+            fp16 var_4149_to_fp16 = const()[name = string("op_4149_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_11_cast_fp16 = add(x = var_4147, y = var_4149_to_fp16)[name = string("mean_sq_11_cast_fp16")];
+            fp32 var_4151_epsilon_0 = const()[name = string("op_4151_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_4151_cast_fp16 = rsqrt(epsilon = var_4151_epsilon_0, x = mean_sq_11_cast_fp16)[name = string("op_4151_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> v_1_cast_fp16 = mul(x = var_4110, y = var_4151_cast_fp16)[name = string("v_1_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> var_4153_cast_fp16 = mul(x = q_45, y = cos_f)[name = string("op_4153_cast_fp16")];
+            tensor<int32, [2]> var_4154_split_sizes_0 = const()[name = string("op_4154_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4154_axis_0 = const()[name = string("op_4154_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 256]> var_4154_0, tensor<fp16, [1, 2, 1, 256]> var_4154_1 = split(axis = var_4154_axis_0, split_sizes = var_4154_split_sizes_0, x = q_45)[name = string("op_4154")];
+            fp16 const_64_promoted = const()[name = string("const_64_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 256]> var_4156 = mul(x = var_4154_1, y = const_64_promoted)[name = string("op_4156")];
+            int32 var_4158 = const()[name = string("op_4158"), val = int32(-1)];
+            bool var_4159_interleave_0 = const()[name = string("op_4159_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 512]> var_4159 = concat(axis = var_4158, interleave = var_4159_interleave_0, values = (var_4156, var_4154_0))[name = string("op_4159")];
+            tensor<fp16, [1, 2, 1, 512]> var_4160_cast_fp16 = mul(x = var_4159, y = sin_f)[name = string("op_4160_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> k_13_cast_fp16 = add(x = var_4153_cast_fp16, y = var_4160_cast_fp16)[name = string("k_13_cast_fp16")];
+            fp16 var_4163_promoted_to_fp16 = const()[name = string("op_4163_promoted_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 2048, 1]> var_4165_cast_fp16 = sub(x = var_4163_promoted_to_fp16, y = update_mask)[name = string("op_4165_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4166_cast_fp16 = mul(x = K_full_slot_1_cast_fp16, y = var_4165_cast_fp16)[name = string("op_4166_cast_fp16")];
+            tensor<int32, [4]> var_4167_reps_0 = const()[name = string("op_4167_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4167_cast_fp16 = tile(reps = var_4167_reps_0, x = k_13_cast_fp16)[name = string("op_4167_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4168_cast_fp16 = mul(x = var_4167_cast_fp16, y = update_mask)[name = string("op_4168_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_out_1_cast_fp16 = add(x = var_4166_cast_fp16, y = var_4168_cast_fp16)[name = string("K_full_out_1_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4174_cast_fp16 = mul(x = V_full_slot_1_cast_fp16, y = var_4165_cast_fp16)[name = string("op_4174_cast_fp16")];
+            tensor<int32, [4]> var_4175_reps_0 = const()[name = string("op_4175_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_4175_cast_fp16 = tile(reps = var_4175_reps_0, x = v_1_cast_fp16)[name = string("op_4175_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_4176_cast_fp16 = mul(x = var_4175_cast_fp16, y = update_mask)[name = string("op_4176_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_out_1_cast_fp16 = add(x = var_4174_cast_fp16, y = var_4176_cast_fp16)[name = string("V_full_out_1_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = K_full_out_1_cast_fp16)[name = string("transpose_211")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_20, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_21 = const()[name = string("concat_21"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_210")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_21, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_89_perm_0 = const()[name = string("transpose_89_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = V_full_out_1_cast_fp16)[name = string("transpose_209")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_22, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_208")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_23, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_89_cast_fp16 = transpose(perm = transpose_89_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_207")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_47_cast_fp16, y = transpose_89_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_107_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_107_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_107_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 1, 2048]> var_4218 = sub(x = x_107_cast_fp16, y = reduce_max_5)[name = string("op_4218")];
+            tensor<fp16, [1, 8, 1, 2048]> var_4224 = exp(x = var_4218)[name = string("op_4224")];
+            tensor<int32, [1]> var_4234_axes_0 = const()[name = string("op_4234_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4234_keep_dims_0 = const()[name = string("op_4234_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_4234 = reduce_sum(axes = var_4234_axes_0, keep_dims = var_4234_keep_dims_0, x = var_4224)[name = string("op_4234")];
+            tensor<fp16, [1, 8, 1, 2048]> var_4240_cast_fp16 = real_div(x = var_4224, y = var_4234)[name = string("op_4240_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_206")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_4240_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_4251 = const()[name = string("op_4251"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4258 = const()[name = string("op_4258"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_4252_cast_fp16 = transpose(perm = var_4251, x = attn_output_31_cast_fp16)[name = string("transpose_205")];
+            tensor<fp16, [1, 1, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_4258, x = var_4252_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_4263 = const()[name = string("op_4263"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4279_pad_type_0 = const()[name = string("op_4279_pad_type_0"), val = string("valid")];
+            int32 var_4279_groups_0 = const()[name = string("op_4279_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4279_strides_0 = const()[name = string("op_4279_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4279_pad_0 = const()[name = string("op_4279_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4279_dilations_0 = const()[name = string("op_4279_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(929473152))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(934716096))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_4264_cast_fp16 = transpose(perm = var_4263, x = attn_output_33_cast_fp16)[name = string("transpose_204")];
+            tensor<fp16, [1, 2560, 1]> var_4279_cast_fp16 = conv(dilations = var_4279_dilations_0, groups = var_4279_groups_0, pad = var_4279_pad_0, pad_type = var_4279_pad_type_0, strides = var_4279_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_4264_cast_fp16)[name = string("op_4279_cast_fp16")];
+            tensor<int32, [3]> var_4283 = const()[name = string("op_4283"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4289 = const()[name = string("op_4289"), val = int32(-1)];
+            fp16 const_65_promoted_to_fp16 = const()[name = string("const_65_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_111_cast_fp16 = transpose(perm = var_4283, x = var_4279_cast_fp16)[name = string("transpose_203")];
+            tensor<fp16, [1, 1, 2560]> var_4291_cast_fp16 = mul(x = x_111_cast_fp16, y = const_65_promoted_to_fp16)[name = string("op_4291_cast_fp16")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_161_cast_fp16 = concat(axis = var_4289, interleave = input_161_interleave_0, values = (x_111_cast_fp16, var_4291_cast_fp16))[name = string("input_161_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4286_to_fp16 = const()[name = string("op_4286_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_4286_to_fp16, x = input_161_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_4296_split_sizes_0 = const()[name = string("op_4296_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4296_axis_0 = const()[name = string("op_4296_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4296_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4296_cast_fp16_1 = split(axis = var_4296_axis_0, split_sizes = var_4296_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_4296_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(934718720)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_35_cast_fp16 = mul(x = var_4296_cast_fp16_0, y = layers_c2_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_113_cast_fp16 = add(x = x_99_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_113_cast_fp16")];
+            int32 var_4305 = const()[name = string("op_4305"), val = int32(-1)];
+            fp16 const_66_promoted_to_fp16 = const()[name = string("const_66_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4307_cast_fp16 = mul(x = x_113_cast_fp16, y = const_66_promoted_to_fp16)[name = string("op_4307_cast_fp16")];
+            bool input_163_interleave_0 = const()[name = string("input_163_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_163_cast_fp16 = concat(axis = var_4305, interleave = input_163_interleave_0, values = (x_113_cast_fp16, var_4307_cast_fp16))[name = string("input_163_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4302_to_fp16 = const()[name = string("op_4302_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_4302_to_fp16, x = input_163_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_4312_split_sizes_0 = const()[name = string("op_4312_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4312_axis_0 = const()[name = string("op_4312_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4312_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4312_cast_fp16_1 = split(axis = var_4312_axis_0, split_sizes = var_4312_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_4312_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(934723904)))];
+            tensor<fp16, [1, 1, 2560]> h_33_cast_fp16 = mul(x = var_4312_cast_fp16_0, y = layers_c2_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_4323 = const()[name = string("op_4323"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_165_axes_0 = const()[name = string("input_165_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4324 = transpose(perm = var_4323, x = h_33_cast_fp16)[name = string("transpose_202")];
+            tensor<fp16, [1, 2560, 1, 1]> input_165 = expand_dims(axes = input_165_axes_0, x = var_4324)[name = string("input_165")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_c2_5_mlp_gate_proj_weight_palettized, x = input_165)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_c2_5_mlp_up_proj_weight_palettized, x = input_165)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 1]> input_167 = mul(x = gate_23, y = up_11)[name = string("input_167")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_c2_5_mlp_down_proj_weight_palettized, x = input_167)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_4364_axes_0 = const()[name = string("op_4364_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4364 = squeeze(axes = var_4364_axes_0, x = mlp_out_11)[name = string("op_4364")];
+            tensor<int32, [3]> var_4368 = const()[name = string("op_4368"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4374 = const()[name = string("op_4374"), val = int32(-1)];
+            fp16 const_67_promoted = const()[name = string("const_67_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_115 = transpose(perm = var_4368, x = var_4364)[name = string("transpose_201")];
+            tensor<fp16, [1, 1, 2560]> var_4376 = mul(x = x_115, y = const_67_promoted)[name = string("op_4376")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_169 = concat(axis = var_4374, interleave = input_169_interleave_0, values = (x_115, var_4376))[name = string("input_169")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4371_to_fp16 = const()[name = string("op_4371_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_4371_to_fp16, x = input_169)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_4381_split_sizes_0 = const()[name = string("op_4381_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4381_axis_0 = const()[name = string("op_4381_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4381_0, tensor<fp16, [1, 1, 2560]> var_4381_1 = split(axis = var_4381_axis_0, split_sizes = var_4381_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_4381")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_53 = mul(x = var_4381_0, y = layers_c2_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_55_cast_fp16 = add(x = x_113_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 4352])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 1, 4608])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_4409 = const()[name = string("op_4409"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_171_axes_0 = const()[name = string("input_171_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4410 = transpose(perm = var_4409, x = hidden_states_55_cast_fp16)[name = string("transpose_200")];
+            tensor<fp16, [1, 2560, 1, 1]> input_171 = expand_dims(axes = input_171_axes_0, x = var_4410)[name = string("input_171")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_c2_5_per_layer_input_gate_weight_palettized, x = input_171)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_4429 = const()[name = string("op_4429"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_4430_cast_fp16 = transpose(perm = var_4429, x = per_layer_slice_11_cast_fp16)[name = string("transpose_199")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_4430_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_173_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_173_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(934729088))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(935056832))))[name = string("layers_c2_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_c2_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_173_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_4446_axes_0 = const()[name = string("op_4446_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4446_cast_fp16 = squeeze(axes = var_4446_axes_0, x = gated_35_cast_fp16)[name = string("op_4446_cast_fp16")];
+            tensor<int32, [3]> var_4450 = const()[name = string("op_4450"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4456 = const()[name = string("op_4456"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_117_cast_fp16 = transpose(perm = var_4450, x = var_4446_cast_fp16)[name = string("transpose_198")];
+            tensor<fp16, [1, 1, 2560]> var_4458_cast_fp16 = mul(x = x_117_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_4458_cast_fp16")];
+            bool input_175_interleave_0 = const()[name = string("input_175_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_175_cast_fp16 = concat(axis = var_4456, interleave = input_175_interleave_0, values = (x_117_cast_fp16, var_4458_cast_fp16))[name = string("input_175_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4453_to_fp16 = const()[name = string("op_4453_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_4453_to_fp16, x = input_175_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_4463_split_sizes_0 = const()[name = string("op_4463_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4463_axis_0 = const()[name = string("op_4463_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4463_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4463_cast_fp16_1 = split(axis = var_4463_axis_0, split_sizes = var_4463_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_4463_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(935059456)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_59_cast_fp16 = mul(x = var_4463_cast_fp16_0, y = layers_c2_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_69_promoted_to_fp16 = const()[name = string("const_69_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b2p-2])];
+            tensor<fp16, [1, 1, 2560]> x_119_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_69_promoted_to_fp16)[name = string("x_119_cast_fp16")];
+            tensor<int32, [1]> var_4475_axes_0 = const()[name = string("op_4475_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_4475_cast_fp16 = squeeze(axes = var_4475_axes_0, x = K_full_out_1_cast_fp16)[name = string("op_4475_cast_fp16")];
+            tensor<int32, [1]> var_4477_axes_0 = const()[name = string("op_4477_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_4477_cast_fp16 = squeeze(axes = var_4477_axes_0, x = V_full_out_1_cast_fp16)[name = string("op_4477_cast_fp16")];
+            tensor<int32, [4]> var_4480_begin_0 = const()[name = string("op_4480_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4480_end_0 = const()[name = string("op_4480_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4480_end_mask_0 = const()[name = string("op_4480_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4480_squeeze_mask_0 = const()[name = string("op_4480_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4480_cast_fp16 = slice_by_index(begin = var_4480_begin_0, end = var_4480_end_0, end_mask = var_4480_end_mask_0, squeeze_mask = var_4480_squeeze_mask_0, x = K_sliding_in)[name = string("op_4480_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_11_axes_0 = const()[name = string("K_sliding_slot_11_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_11_cast_fp16 = expand_dims(axes = K_sliding_slot_11_axes_0, x = var_4480_cast_fp16)[name = string("K_sliding_slot_11_cast_fp16")];
+            tensor<int32, [4]> var_4485_begin_0 = const()[name = string("op_4485_begin_0"), val = tensor<int32, [4]>([5, 0, 0, 0])];
+            tensor<int32, [4]> var_4485_end_0 = const()[name = string("op_4485_end_0"), val = tensor<int32, [4]>([6, 2, 512, 512])];
+            tensor<bool, [4]> var_4485_end_mask_0 = const()[name = string("op_4485_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_4485_squeeze_mask_0 = const()[name = string("op_4485_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_4485_cast_fp16 = slice_by_index(begin = var_4485_begin_0, end = var_4485_end_0, end_mask = var_4485_end_mask_0, squeeze_mask = var_4485_squeeze_mask_0, x = V_sliding_in)[name = string("op_4485_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_11_axes_0 = const()[name = string("V_sliding_slot_11_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_11_cast_fp16 = expand_dims(axes = V_sliding_slot_11_axes_0, x = var_4485_cast_fp16)[name = string("V_sliding_slot_11_cast_fp16")];
+            int32 var_4492 = const()[name = string("op_4492"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4494_cast_fp16 = mul(x = x_119_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_4494_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_177_cast_fp16 = concat(axis = var_4492, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_4494_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4489_to_fp16 = const()[name = string("op_4489_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_4489_to_fp16, x = input_177_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_4499_split_sizes_0 = const()[name = string("op_4499_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4499_axis_0 = const()[name = string("op_4499_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4499_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4499_cast_fp16_1 = split(axis = var_4499_axis_0, split_sizes = var_4499_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_4499_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(935064640)))];
+            tensor<fp16, [1, 1, 2560]> h_37_cast_fp16 = mul(x = var_4499_cast_fp16_0, y = layers_c2_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_4505 = const()[name = string("op_4505"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_4508_axes_0 = const()[name = string("op_4508_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4506_cast_fp16 = transpose(perm = var_4505, x = h_37_cast_fp16)[name = string("transpose_197")];
+            tensor<fp16, [1, 2560, 1, 1]> var_4508_cast_fp16 = expand_dims(axes = var_4508_axes_0, x = var_4506_cast_fp16)[name = string("op_4508_cast_fp16")];
+            string var_4524_pad_type_0 = const()[name = string("op_4524_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4524_strides_0 = const()[name = string("op_4524_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4524_pad_0 = const()[name = string("op_4524_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4524_dilations_0 = const()[name = string("op_4524_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4524_groups_0 = const()[name = string("op_4524_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_4524 = conv(dilations = var_4524_dilations_0, groups = var_4524_groups_0, pad = var_4524_pad_0, pad_type = var_4524_pad_type_0, strides = var_4524_strides_0, weight = layers_c2_6_self_attn_q_proj_weight_palettized, x = var_4508_cast_fp16)[name = string("op_4524")];
+            tensor<int32, [4]> var_4529 = const()[name = string("op_4529"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_4530 = reshape(shape = var_4529, x = var_4524)[name = string("op_4530")];
+            tensor<int32, [4]> var_4535 = const()[name = string("op_4535"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4545 = const()[name = string("op_4545"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_4536 = transpose(perm = var_4535, x = var_4530)[name = string("transpose_196")];
+            tensor<fp16, [1, 8, 256]> x_121 = reshape(shape = var_4545, x = var_4536)[name = string("x_121")];
+            int32 var_4551 = const()[name = string("op_4551"), val = int32(-1)];
+            fp16 const_71_promoted = const()[name = string("const_71_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_4553 = mul(x = x_121, y = const_71_promoted)[name = string("op_4553")];
+            bool input_181_interleave_0 = const()[name = string("input_181_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_181 = concat(axis = var_4551, interleave = input_181_interleave_0, values = (x_121, var_4553))[name = string("input_181")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4548_to_fp16 = const()[name = string("op_4548_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_4548_to_fp16, x = input_181)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_4558_split_sizes_0 = const()[name = string("op_4558_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4558_axis_0 = const()[name = string("op_4558_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_4558_0, tensor<fp16, [1, 8, 256]> var_4558_1 = split(axis = var_4558_axis_0, split_sizes = var_4558_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_4558")];
+            tensor<fp16, [1, 8, 256]> var_4560 = mul(x = var_4558_0, y = layers_c2_2_self_attn_q_norm_weight)[name = string("op_4560")];
+            tensor<int32, [4]> var_4565 = const()[name = string("op_4565"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_51 = reshape(shape = var_4565, x = var_4560)[name = string("q_51")];
+            tensor<fp16, [1, 8, 1, 256]> var_4567_cast_fp16 = mul(x = q_51, y = cos_s)[name = string("op_4567_cast_fp16")];
+            tensor<int32, [2]> var_4568_split_sizes_0 = const()[name = string("op_4568_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4568_axis_0 = const()[name = string("op_4568_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_4568_0, tensor<fp16, [1, 8, 1, 128]> var_4568_1 = split(axis = var_4568_axis_0, split_sizes = var_4568_split_sizes_0, x = q_51)[name = string("op_4568")];
+            fp16 const_72_promoted = const()[name = string("const_72_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_4570 = mul(x = var_4568_1, y = const_72_promoted)[name = string("op_4570")];
+            int32 var_4572 = const()[name = string("op_4572"), val = int32(-1)];
+            bool var_4573_interleave_0 = const()[name = string("op_4573_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_4573 = concat(axis = var_4572, interleave = var_4573_interleave_0, values = (var_4570, var_4568_0))[name = string("op_4573")];
+            tensor<fp16, [1, 8, 1, 256]> var_4574_cast_fp16 = mul(x = var_4573, y = sin_s)[name = string("op_4574_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_55_cast_fp16 = add(x = var_4567_cast_fp16, y = var_4574_cast_fp16)[name = string("q_55_cast_fp16")];
+            string var_4587_pad_type_0 = const()[name = string("op_4587_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4587_strides_0 = const()[name = string("op_4587_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4587_pad_0 = const()[name = string("op_4587_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4587_dilations_0 = const()[name = string("op_4587_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4587_groups_0 = const()[name = string("op_4587_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4587 = conv(dilations = var_4587_dilations_0, groups = var_4587_groups_0, pad = var_4587_pad_0, pad_type = var_4587_pad_type_0, strides = var_4587_strides_0, weight = layers_c2_6_self_attn_k_proj_weight_palettized, x = var_4508_cast_fp16)[name = string("op_4587")];
+            tensor<int32, [4]> var_4592 = const()[name = string("op_4592"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4593 = reshape(shape = var_4592, x = var_4587)[name = string("op_4593")];
+            tensor<int32, [4]> var_4598 = const()[name = string("op_4598"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_4615_pad_type_0 = const()[name = string("op_4615_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_4615_strides_0 = const()[name = string("op_4615_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_4615_pad_0 = const()[name = string("op_4615_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_4615_dilations_0 = const()[name = string("op_4615_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_4615_groups_0 = const()[name = string("op_4615_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_4615 = conv(dilations = var_4615_dilations_0, groups = var_4615_groups_0, pad = var_4615_pad_0, pad_type = var_4615_pad_type_0, strides = var_4615_strides_0, weight = layers_c2_6_self_attn_v_proj_weight_palettized, x = var_4508_cast_fp16)[name = string("op_4615")];
+            tensor<int32, [4]> var_4620 = const()[name = string("op_4620"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_4621 = reshape(shape = var_4620, x = var_4615)[name = string("op_4621")];
+            tensor<int32, [4]> var_4626 = const()[name = string("op_4626"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_4636 = const()[name = string("op_4636"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_4599 = transpose(perm = var_4598, x = var_4593)[name = string("transpose_195")];
+            tensor<fp16, [1, 2, 256]> x_123 = reshape(shape = var_4636, x = var_4599)[name = string("x_123")];
+            int32 var_4642 = const()[name = string("op_4642"), val = int32(-1)];
+            fp16 const_73_promoted = const()[name = string("const_73_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_4644 = mul(x = x_123, y = const_73_promoted)[name = string("op_4644")];
+            bool input_183_interleave_0 = const()[name = string("input_183_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_183 = concat(axis = var_4642, interleave = input_183_interleave_0, values = (x_123, var_4644))[name = string("input_183")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4639_to_fp16 = const()[name = string("op_4639_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_4639_to_fp16, x = input_183)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_4649_split_sizes_0 = const()[name = string("op_4649_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_4649_axis_0 = const()[name = string("op_4649_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_4649_0, tensor<fp16, [1, 2, 256]> var_4649_1 = split(axis = var_4649_axis_0, split_sizes = var_4649_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_4649")];
+            tensor<fp16, [1, 2, 256]> var_4651 = mul(x = var_4649_0, y = layers_c2_6_self_attn_k_norm_weight)[name = string("op_4651")];
+            tensor<int32, [4]> var_4656 = const()[name = string("op_4656"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_53 = reshape(shape = var_4656, x = var_4651)[name = string("q_53")];
+            fp16 var_4658_promoted = const()[name = string("op_4658_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_4627 = transpose(perm = var_4626, x = var_4621)[name = string("transpose_194")];
+            tensor<fp16, [1, 2, 1, 256]> var_4659 = pow(x = var_4627, y = var_4658_promoted)[name = string("op_4659")];
+            tensor<int32, [1]> var_4664_axes_0 = const()[name = string("op_4664_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4664_keep_dims_0 = const()[name = string("op_4664_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_4664 = reduce_mean(axes = var_4664_axes_0, keep_dims = var_4664_keep_dims_0, x = var_4659)[name = string("op_4664")];
+            fp16 var_4666_to_fp16 = const()[name = string("op_4666_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_13_cast_fp16 = add(x = var_4664, y = var_4666_to_fp16)[name = string("mean_sq_13_cast_fp16")];
+            fp32 var_4668_epsilon_0 = const()[name = string("op_4668_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_4668_cast_fp16 = rsqrt(epsilon = var_4668_epsilon_0, x = mean_sq_13_cast_fp16)[name = string("op_4668_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_187_cast_fp16 = mul(x = var_4627, y = var_4668_cast_fp16)[name = string("input_187_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_4670_cast_fp16 = mul(x = q_53, y = cos_s)[name = string("op_4670_cast_fp16")];
+            tensor<int32, [2]> var_4671_split_sizes_0 = const()[name = string("op_4671_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_4671_axis_0 = const()[name = string("op_4671_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_4671_0, tensor<fp16, [1, 2, 1, 128]> var_4671_1 = split(axis = var_4671_axis_0, split_sizes = var_4671_split_sizes_0, x = q_53)[name = string("op_4671")];
+            fp16 const_74_promoted = const()[name = string("const_74_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_4673 = mul(x = var_4671_1, y = const_74_promoted)[name = string("op_4673")];
+            int32 var_4675 = const()[name = string("op_4675"), val = int32(-1)];
+            bool var_4676_interleave_0 = const()[name = string("op_4676_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_4676 = concat(axis = var_4675, interleave = var_4676_interleave_0, values = (var_4673, var_4671_0))[name = string("op_4676")];
+            tensor<fp16, [1, 2, 1, 256]> var_4677_cast_fp16 = mul(x = var_4676, y = sin_s)[name = string("op_4677_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_185_cast_fp16 = add(x = var_4670_cast_fp16, y = var_4677_cast_fp16)[name = string("input_185_cast_fp16")];
+            tensor<int32, [8]> k_padded_11_pad_0 = const()[name = string("k_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_11_mode_0 = const()[name = string("k_padded_11_mode_0"), val = string("constant")];
+            fp16 const_75_to_fp16 = const()[name = string("const_75_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_11_cast_fp16 = pad(constant_val = const_75_to_fp16, mode = k_padded_11_mode_0, pad = k_padded_11_pad_0, x = input_185_cast_fp16)[name = string("k_padded_11_cast_fp16")];
+            tensor<int32, [8]> v_padded_11_pad_0 = const()[name = string("v_padded_11_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_11_mode_0 = const()[name = string("v_padded_11_mode_0"), val = string("constant")];
+            fp16 const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_11_cast_fp16 = pad(constant_val = const_76_to_fp16, mode = v_padded_11_mode_0, pad = v_padded_11_pad_0, x = input_187_cast_fp16)[name = string("v_padded_11_cast_fp16")];
+            tensor<int32, [4]> var_4706_begin_0 = const()[name = string("op_4706_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4706_end_0 = const()[name = string("op_4706_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4706_end_mask_0 = const()[name = string("op_4706_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4706_cast_fp16 = slice_by_index(begin = var_4706_begin_0, end = var_4706_end_0, end_mask = var_4706_end_mask_0, x = K_sliding_slot_11_cast_fp16)[name = string("op_4706_cast_fp16")];
+            int32 var_4713 = const()[name = string("op_4713"), val = int32(2)];
+            bool K_sliding_out_11_interleave_0 = const()[name = string("K_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_11_cast_fp16 = concat(axis = var_4713, interleave = K_sliding_out_11_interleave_0, values = (var_4706_cast_fp16, k_padded_11_cast_fp16))[name = string("K_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> var_4729_begin_0 = const()[name = string("op_4729_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_4729_end_0 = const()[name = string("op_4729_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_4729_end_mask_0 = const()[name = string("op_4729_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_4729_cast_fp16 = slice_by_index(begin = var_4729_begin_0, end = var_4729_end_0, end_mask = var_4729_end_mask_0, x = V_sliding_slot_11_cast_fp16)[name = string("op_4729_cast_fp16")];
+            int32 var_4736 = const()[name = string("op_4736"), val = int32(2)];
+            bool V_sliding_out_11_interleave_0 = const()[name = string("V_sliding_out_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_11_cast_fp16 = concat(axis = var_4736, interleave = V_sliding_out_11_interleave_0, values = (var_4729_cast_fp16, v_padded_11_cast_fp16))[name = string("V_sliding_out_11_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_13_begin_0 = const()[name = string("K_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_13_end_0 = const()[name = string("K_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_13_end_mask_0 = const()[name = string("K_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_13_cast_fp16 = slice_by_index(begin = K_for_attn_13_begin_0, end = K_for_attn_13_end_0, end_mask = K_for_attn_13_end_mask_0, x = K_sliding_out_11_cast_fp16)[name = string("K_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_13_begin_0 = const()[name = string("V_for_attn_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_13_end_0 = const()[name = string("V_for_attn_13_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_13_end_mask_0 = const()[name = string("V_for_attn_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_13_cast_fp16 = slice_by_index(begin = V_for_attn_13_begin_0, end = V_for_attn_13_end_0, end_mask = V_for_attn_13_end_mask_0, x = V_sliding_out_11_cast_fp16)[name = string("V_for_attn_13_cast_fp16")];
+            tensor<int32, [4]> transpose_24_perm_0 = const()[name = string("transpose_24_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_12_reps_0 = const()[name = string("tile_12_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_24_cast_fp16 = transpose(perm = transpose_24_perm_0, x = K_for_attn_13_cast_fp16)[name = string("transpose_193")];
+            tensor<fp16, [8, 1, 512, 256]> tile_12_cast_fp16 = tile(reps = tile_12_reps_0, x = transpose_24_cast_fp16)[name = string("tile_12_cast_fp16")];
+            tensor<int32, [5]> concat_24 = const()[name = string("concat_24"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_24_cast_fp16 = reshape(shape = concat_24, x = tile_12_cast_fp16)[name = string("reshape_24_cast_fp16")];
+            tensor<int32, [5]> transpose_25_perm_0 = const()[name = string("transpose_25_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_25 = const()[name = string("concat_25"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_25_cast_fp16 = transpose(perm = transpose_25_perm_0, x = reshape_24_cast_fp16)[name = string("transpose_192")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_25_cast_fp16 = reshape(shape = concat_25, x = transpose_25_cast_fp16)[name = string("reshape_25_cast_fp16")];
+            tensor<int32, [4]> transpose_90_perm_0 = const()[name = string("transpose_90_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_26_perm_0 = const()[name = string("transpose_26_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_13_reps_0 = const()[name = string("tile_13_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_26_cast_fp16 = transpose(perm = transpose_26_perm_0, x = V_for_attn_13_cast_fp16)[name = string("transpose_191")];
+            tensor<fp16, [8, 1, 512, 256]> tile_13_cast_fp16 = tile(reps = tile_13_reps_0, x = transpose_26_cast_fp16)[name = string("tile_13_cast_fp16")];
+            tensor<int32, [5]> concat_26 = const()[name = string("concat_26"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_26_cast_fp16 = reshape(shape = concat_26, x = tile_13_cast_fp16)[name = string("reshape_26_cast_fp16")];
+            tensor<int32, [5]> transpose_27_perm_0 = const()[name = string("transpose_27_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_27 = const()[name = string("concat_27"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_27_cast_fp16 = transpose(perm = transpose_27_perm_0, x = reshape_26_cast_fp16)[name = string("transpose_190")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_27_cast_fp16 = reshape(shape = concat_27, x = transpose_27_cast_fp16)[name = string("reshape_27_cast_fp16")];
+            tensor<int32, [4]> V_expanded_13_perm_0 = const()[name = string("V_expanded_13_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_90_cast_fp16 = transpose(perm = transpose_90_perm_0, x = reshape_25_cast_fp16)[name = string("transpose_189")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_55_cast_fp16, y = transpose_90_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_127_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_127_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_127_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 1, 512]> var_4777 = sub(x = x_127_cast_fp16, y = reduce_max_6)[name = string("op_4777")];
+            tensor<fp16, [1, 8, 1, 512]> var_4783 = exp(x = var_4777)[name = string("op_4783")];
+            tensor<int32, [1]> var_4793_axes_0 = const()[name = string("op_4793_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4793_keep_dims_0 = const()[name = string("op_4793_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_4793 = reduce_sum(axes = var_4793_axes_0, keep_dims = var_4793_keep_dims_0, x = var_4783)[name = string("op_4793")];
+            tensor<fp16, [1, 8, 1, 512]> var_4799_cast_fp16 = real_div(x = var_4783, y = var_4793)[name = string("op_4799_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_13_cast_fp16 = transpose(perm = V_expanded_13_perm_0, x = reshape_27_cast_fp16)[name = string("transpose_188")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_4799_cast_fp16, y = V_expanded_13_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_4810 = const()[name = string("op_4810"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4817 = const()[name = string("op_4817"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_4811_cast_fp16 = transpose(perm = var_4810, x = attn_output_37_cast_fp16)[name = string("transpose_187")];
+            tensor<fp16, [1, 1, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_4817, x = var_4811_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_4822 = const()[name = string("op_4822"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_4838_pad_type_0 = const()[name = string("op_4838_pad_type_0"), val = string("valid")];
+            int32 var_4838_groups_0 = const()[name = string("op_4838_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_4838_strides_0 = const()[name = string("op_4838_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_4838_pad_0 = const()[name = string("op_4838_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_4838_dilations_0 = const()[name = string("op_4838_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(935069824))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(937691328))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_4823_cast_fp16 = transpose(perm = var_4822, x = attn_output_39_cast_fp16)[name = string("transpose_186")];
+            tensor<fp16, [1, 2560, 1]> var_4838_cast_fp16 = conv(dilations = var_4838_dilations_0, groups = var_4838_groups_0, pad = var_4838_pad_0, pad_type = var_4838_pad_type_0, strides = var_4838_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_4823_cast_fp16)[name = string("op_4838_cast_fp16")];
+            tensor<int32, [3]> var_4842 = const()[name = string("op_4842"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4848 = const()[name = string("op_4848"), val = int32(-1)];
+            fp16 const_77_promoted_to_fp16 = const()[name = string("const_77_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_131_cast_fp16 = transpose(perm = var_4842, x = var_4838_cast_fp16)[name = string("transpose_185")];
+            tensor<fp16, [1, 1, 2560]> var_4850_cast_fp16 = mul(x = x_131_cast_fp16, y = const_77_promoted_to_fp16)[name = string("op_4850_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_191_cast_fp16 = concat(axis = var_4848, interleave = input_191_interleave_0, values = (x_131_cast_fp16, var_4850_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4845_to_fp16 = const()[name = string("op_4845_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_4845_to_fp16, x = input_191_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_4855_split_sizes_0 = const()[name = string("op_4855_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4855_axis_0 = const()[name = string("op_4855_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4855_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4855_cast_fp16_1 = split(axis = var_4855_axis_0, split_sizes = var_4855_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_4855_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(937693952)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_41_cast_fp16 = mul(x = var_4855_cast_fp16_0, y = layers_c2_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_133_cast_fp16 = add(x = x_119_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_133_cast_fp16")];
+            int32 var_4864 = const()[name = string("op_4864"), val = int32(-1)];
+            fp16 const_78_promoted_to_fp16 = const()[name = string("const_78_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_4866_cast_fp16 = mul(x = x_133_cast_fp16, y = const_78_promoted_to_fp16)[name = string("op_4866_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_193_cast_fp16 = concat(axis = var_4864, interleave = input_193_interleave_0, values = (x_133_cast_fp16, var_4866_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4861_to_fp16 = const()[name = string("op_4861_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_4861_to_fp16, x = input_193_cast_fp16)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_4871_split_sizes_0 = const()[name = string("op_4871_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4871_axis_0 = const()[name = string("op_4871_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4871_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_4871_cast_fp16_1 = split(axis = var_4871_axis_0, split_sizes = var_4871_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_4871_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(937699136)))];
+            tensor<fp16, [1, 1, 2560]> h_39_cast_fp16 = mul(x = var_4871_cast_fp16_0, y = layers_c2_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_4882 = const()[name = string("op_4882"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_195_axes_0 = const()[name = string("input_195_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4883 = transpose(perm = var_4882, x = h_39_cast_fp16)[name = string("transpose_184")];
+            tensor<fp16, [1, 2560, 1, 1]> input_195 = expand_dims(axes = input_195_axes_0, x = var_4883)[name = string("input_195")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_c2_6_mlp_gate_proj_weight_palettized, x = input_195)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_c2_6_mlp_up_proj_weight_palettized, x = input_195)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 1]> input_197 = mul(x = gate_27, y = up_13)[name = string("input_197")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_c2_6_mlp_down_proj_weight_palettized, x = input_197)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_4923_axes_0 = const()[name = string("op_4923_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4923 = squeeze(axes = var_4923_axes_0, x = mlp_out_13)[name = string("op_4923")];
+            tensor<int32, [3]> var_4927 = const()[name = string("op_4927"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_4933 = const()[name = string("op_4933"), val = int32(-1)];
+            fp16 const_79_promoted = const()[name = string("const_79_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_135 = transpose(perm = var_4927, x = var_4923)[name = string("transpose_183")];
+            tensor<fp16, [1, 1, 2560]> var_4935 = mul(x = x_135, y = const_79_promoted)[name = string("op_4935")];
+            bool input_199_interleave_0 = const()[name = string("input_199_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_199 = concat(axis = var_4933, interleave = input_199_interleave_0, values = (x_135, var_4935))[name = string("input_199")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_4930_to_fp16 = const()[name = string("op_4930_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_4930_to_fp16, x = input_199)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_4940_split_sizes_0 = const()[name = string("op_4940_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_4940_axis_0 = const()[name = string("op_4940_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_4940_0, tensor<fp16, [1, 1, 2560]> var_4940_1 = split(axis = var_4940_axis_0, split_sizes = var_4940_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_4940")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_63 = mul(x = var_4940_0, y = layers_c2_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_65_cast_fp16 = add(x = x_133_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 4608])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 1, 4864])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_4968 = const()[name = string("op_4968"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_201_axes_0 = const()[name = string("input_201_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_4969 = transpose(perm = var_4968, x = hidden_states_65_cast_fp16)[name = string("transpose_182")];
+            tensor<fp16, [1, 2560, 1, 1]> input_201 = expand_dims(axes = input_201_axes_0, x = var_4969)[name = string("input_201")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_c2_6_per_layer_input_gate_weight_palettized, x = input_201)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_4988 = const()[name = string("op_4988"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_4989_cast_fp16 = transpose(perm = var_4988, x = per_layer_slice_13_cast_fp16)[name = string("transpose_181")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_4989_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_203_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_203_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(937704320))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(938032064))))[name = string("layers_c2_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_c2_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_203_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_5005_axes_0 = const()[name = string("op_5005_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5005_cast_fp16 = squeeze(axes = var_5005_axes_0, x = gated_41_cast_fp16)[name = string("op_5005_cast_fp16")];
+            tensor<int32, [3]> var_5009 = const()[name = string("op_5009"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5015 = const()[name = string("op_5015"), val = int32(-1)];
+            fp16 const_80_promoted_to_fp16 = const()[name = string("const_80_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_137_cast_fp16 = transpose(perm = var_5009, x = var_5005_cast_fp16)[name = string("transpose_180")];
+            tensor<fp16, [1, 1, 2560]> var_5017_cast_fp16 = mul(x = x_137_cast_fp16, y = const_80_promoted_to_fp16)[name = string("op_5017_cast_fp16")];
+            bool input_205_interleave_0 = const()[name = string("input_205_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_205_cast_fp16 = concat(axis = var_5015, interleave = input_205_interleave_0, values = (x_137_cast_fp16, var_5017_cast_fp16))[name = string("input_205_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5012_to_fp16 = const()[name = string("op_5012_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_5012_to_fp16, x = input_205_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_5022_split_sizes_0 = const()[name = string("op_5022_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5022_axis_0 = const()[name = string("op_5022_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5022_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5022_cast_fp16_1 = split(axis = var_5022_axis_0, split_sizes = var_5022_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_5022_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(938034688)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_69_cast_fp16 = mul(x = var_5022_cast_fp16_0, y = layers_c2_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_81_promoted_to_fp16 = const()[name = string("const_81_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.16p-1])];
+            tensor<fp16, [1, 1, 2560]> x_139_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_81_promoted_to_fp16)[name = string("x_139_cast_fp16")];
+            tensor<int32, [1]> var_5034_axes_0 = const()[name = string("op_5034_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5034_cast_fp16 = squeeze(axes = var_5034_axes_0, x = K_sliding_out_11_cast_fp16)[name = string("op_5034_cast_fp16")];
+            tensor<int32, [1]> var_5036_axes_0 = const()[name = string("op_5036_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5036_cast_fp16 = squeeze(axes = var_5036_axes_0, x = V_sliding_out_11_cast_fp16)[name = string("op_5036_cast_fp16")];
+            tensor<int32, [4]> var_5039_begin_0 = const()[name = string("op_5039_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_5039_end_0 = const()[name = string("op_5039_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5039_end_mask_0 = const()[name = string("op_5039_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5039_squeeze_mask_0 = const()[name = string("op_5039_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5039_cast_fp16 = slice_by_index(begin = var_5039_begin_0, end = var_5039_end_0, end_mask = var_5039_end_mask_0, squeeze_mask = var_5039_squeeze_mask_0, x = K_sliding_in)[name = string("op_5039_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_13_axes_0 = const()[name = string("K_sliding_slot_13_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_13_cast_fp16 = expand_dims(axes = K_sliding_slot_13_axes_0, x = var_5039_cast_fp16)[name = string("K_sliding_slot_13_cast_fp16")];
+            tensor<int32, [4]> var_5044_begin_0 = const()[name = string("op_5044_begin_0"), val = tensor<int32, [4]>([6, 0, 0, 0])];
+            tensor<int32, [4]> var_5044_end_0 = const()[name = string("op_5044_end_0"), val = tensor<int32, [4]>([7, 2, 512, 512])];
+            tensor<bool, [4]> var_5044_end_mask_0 = const()[name = string("op_5044_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5044_squeeze_mask_0 = const()[name = string("op_5044_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5044_cast_fp16 = slice_by_index(begin = var_5044_begin_0, end = var_5044_end_0, end_mask = var_5044_end_mask_0, squeeze_mask = var_5044_squeeze_mask_0, x = V_sliding_in)[name = string("op_5044_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_13_axes_0 = const()[name = string("V_sliding_slot_13_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_13_cast_fp16 = expand_dims(axes = V_sliding_slot_13_axes_0, x = var_5044_cast_fp16)[name = string("V_sliding_slot_13_cast_fp16")];
+            int32 var_5051 = const()[name = string("op_5051"), val = int32(-1)];
+            fp16 const_82_promoted_to_fp16 = const()[name = string("const_82_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5053_cast_fp16 = mul(x = x_139_cast_fp16, y = const_82_promoted_to_fp16)[name = string("op_5053_cast_fp16")];
+            bool input_207_interleave_0 = const()[name = string("input_207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_207_cast_fp16 = concat(axis = var_5051, interleave = input_207_interleave_0, values = (x_139_cast_fp16, var_5053_cast_fp16))[name = string("input_207_cast_fp16")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5048_to_fp16 = const()[name = string("op_5048_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_5048_to_fp16, x = input_207_cast_fp16)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_5058_split_sizes_0 = const()[name = string("op_5058_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5058_axis_0 = const()[name = string("op_5058_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5058_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5058_cast_fp16_1 = split(axis = var_5058_axis_0, split_sizes = var_5058_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_5058_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(938039872)))];
+            tensor<fp16, [1, 1, 2560]> h_43_cast_fp16 = mul(x = var_5058_cast_fp16_0, y = layers_c2_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_5064 = const()[name = string("op_5064"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5067_axes_0 = const()[name = string("op_5067_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5065_cast_fp16 = transpose(perm = var_5064, x = h_43_cast_fp16)[name = string("transpose_179")];
+            tensor<fp16, [1, 2560, 1, 1]> var_5067_cast_fp16 = expand_dims(axes = var_5067_axes_0, x = var_5065_cast_fp16)[name = string("op_5067_cast_fp16")];
+            string var_5083_pad_type_0 = const()[name = string("op_5083_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5083_strides_0 = const()[name = string("op_5083_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5083_pad_0 = const()[name = string("op_5083_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5083_dilations_0 = const()[name = string("op_5083_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5083_groups_0 = const()[name = string("op_5083_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_5083 = conv(dilations = var_5083_dilations_0, groups = var_5083_groups_0, pad = var_5083_pad_0, pad_type = var_5083_pad_type_0, strides = var_5083_strides_0, weight = layers_c2_7_self_attn_q_proj_weight_palettized, x = var_5067_cast_fp16)[name = string("op_5083")];
+            tensor<int32, [4]> var_5088 = const()[name = string("op_5088"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_5089 = reshape(shape = var_5088, x = var_5083)[name = string("op_5089")];
+            tensor<int32, [4]> var_5094 = const()[name = string("op_5094"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5104 = const()[name = string("op_5104"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_5095 = transpose(perm = var_5094, x = var_5089)[name = string("transpose_178")];
+            tensor<fp16, [1, 8, 256]> x_141 = reshape(shape = var_5104, x = var_5095)[name = string("x_141")];
+            int32 var_5110 = const()[name = string("op_5110"), val = int32(-1)];
+            fp16 const_83_promoted = const()[name = string("const_83_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_5112 = mul(x = x_141, y = const_83_promoted)[name = string("op_5112")];
+            bool input_211_interleave_0 = const()[name = string("input_211_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_211 = concat(axis = var_5110, interleave = input_211_interleave_0, values = (x_141, var_5112))[name = string("input_211")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5107_to_fp16 = const()[name = string("op_5107_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_5107_to_fp16, x = input_211)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_5117_split_sizes_0 = const()[name = string("op_5117_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5117_axis_0 = const()[name = string("op_5117_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_5117_0, tensor<fp16, [1, 8, 256]> var_5117_1 = split(axis = var_5117_axis_0, split_sizes = var_5117_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_5117")];
+            tensor<fp16, [1, 8, 256]> var_5119 = mul(x = var_5117_0, y = layers_c2_7_self_attn_q_norm_weight)[name = string("op_5119")];
+            tensor<int32, [4]> var_5124 = const()[name = string("op_5124"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_59 = reshape(shape = var_5124, x = var_5119)[name = string("q_59")];
+            tensor<fp16, [1, 8, 1, 256]> var_5126_cast_fp16 = mul(x = q_59, y = cos_s)[name = string("op_5126_cast_fp16")];
+            tensor<int32, [2]> var_5127_split_sizes_0 = const()[name = string("op_5127_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5127_axis_0 = const()[name = string("op_5127_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_5127_0, tensor<fp16, [1, 8, 1, 128]> var_5127_1 = split(axis = var_5127_axis_0, split_sizes = var_5127_split_sizes_0, x = q_59)[name = string("op_5127")];
+            fp16 const_84_promoted = const()[name = string("const_84_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_5129 = mul(x = var_5127_1, y = const_84_promoted)[name = string("op_5129")];
+            int32 var_5131 = const()[name = string("op_5131"), val = int32(-1)];
+            bool var_5132_interleave_0 = const()[name = string("op_5132_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_5132 = concat(axis = var_5131, interleave = var_5132_interleave_0, values = (var_5129, var_5127_0))[name = string("op_5132")];
+            tensor<fp16, [1, 8, 1, 256]> var_5133_cast_fp16 = mul(x = var_5132, y = sin_s)[name = string("op_5133_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_63_cast_fp16 = add(x = var_5126_cast_fp16, y = var_5133_cast_fp16)[name = string("q_63_cast_fp16")];
+            string var_5146_pad_type_0 = const()[name = string("op_5146_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5146_strides_0 = const()[name = string("op_5146_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5146_pad_0 = const()[name = string("op_5146_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5146_dilations_0 = const()[name = string("op_5146_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5146_groups_0 = const()[name = string("op_5146_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5146 = conv(dilations = var_5146_dilations_0, groups = var_5146_groups_0, pad = var_5146_pad_0, pad_type = var_5146_pad_type_0, strides = var_5146_strides_0, weight = layers_c2_7_self_attn_k_proj_weight_palettized, x = var_5067_cast_fp16)[name = string("op_5146")];
+            tensor<int32, [4]> var_5151 = const()[name = string("op_5151"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5152 = reshape(shape = var_5151, x = var_5146)[name = string("op_5152")];
+            tensor<int32, [4]> var_5157 = const()[name = string("op_5157"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_5174_pad_type_0 = const()[name = string("op_5174_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5174_strides_0 = const()[name = string("op_5174_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5174_pad_0 = const()[name = string("op_5174_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5174_dilations_0 = const()[name = string("op_5174_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5174_groups_0 = const()[name = string("op_5174_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5174 = conv(dilations = var_5174_dilations_0, groups = var_5174_groups_0, pad = var_5174_pad_0, pad_type = var_5174_pad_type_0, strides = var_5174_strides_0, weight = layers_c2_7_self_attn_v_proj_weight_palettized, x = var_5067_cast_fp16)[name = string("op_5174")];
+            tensor<int32, [4]> var_5179 = const()[name = string("op_5179"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5180 = reshape(shape = var_5179, x = var_5174)[name = string("op_5180")];
+            tensor<int32, [4]> var_5185 = const()[name = string("op_5185"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5195 = const()[name = string("op_5195"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_5158 = transpose(perm = var_5157, x = var_5152)[name = string("transpose_177")];
+            tensor<fp16, [1, 2, 256]> x_143 = reshape(shape = var_5195, x = var_5158)[name = string("x_143")];
+            int32 var_5201 = const()[name = string("op_5201"), val = int32(-1)];
+            fp16 const_85_promoted = const()[name = string("const_85_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_5203 = mul(x = x_143, y = const_85_promoted)[name = string("op_5203")];
+            bool input_213_interleave_0 = const()[name = string("input_213_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_213 = concat(axis = var_5201, interleave = input_213_interleave_0, values = (x_143, var_5203))[name = string("input_213")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5198_to_fp16 = const()[name = string("op_5198_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_5198_to_fp16, x = input_213)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_5208_split_sizes_0 = const()[name = string("op_5208_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5208_axis_0 = const()[name = string("op_5208_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_5208_0, tensor<fp16, [1, 2, 256]> var_5208_1 = split(axis = var_5208_axis_0, split_sizes = var_5208_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_5208")];
+            tensor<fp16, [1, 2, 256]> var_5210 = mul(x = var_5208_0, y = layers_c2_7_self_attn_k_norm_weight)[name = string("op_5210")];
+            tensor<int32, [4]> var_5215 = const()[name = string("op_5215"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_61 = reshape(shape = var_5215, x = var_5210)[name = string("q_61")];
+            fp16 var_5217_promoted = const()[name = string("op_5217_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_5186 = transpose(perm = var_5185, x = var_5180)[name = string("transpose_176")];
+            tensor<fp16, [1, 2, 1, 256]> var_5218 = pow(x = var_5186, y = var_5217_promoted)[name = string("op_5218")];
+            tensor<int32, [1]> var_5223_axes_0 = const()[name = string("op_5223_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5223_keep_dims_0 = const()[name = string("op_5223_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_5223 = reduce_mean(axes = var_5223_axes_0, keep_dims = var_5223_keep_dims_0, x = var_5218)[name = string("op_5223")];
+            fp16 var_5225_to_fp16 = const()[name = string("op_5225_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_15_cast_fp16 = add(x = var_5223, y = var_5225_to_fp16)[name = string("mean_sq_15_cast_fp16")];
+            fp32 var_5227_epsilon_0 = const()[name = string("op_5227_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_5227_cast_fp16 = rsqrt(epsilon = var_5227_epsilon_0, x = mean_sq_15_cast_fp16)[name = string("op_5227_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_217_cast_fp16 = mul(x = var_5186, y = var_5227_cast_fp16)[name = string("input_217_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_5229_cast_fp16 = mul(x = q_61, y = cos_s)[name = string("op_5229_cast_fp16")];
+            tensor<int32, [2]> var_5230_split_sizes_0 = const()[name = string("op_5230_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5230_axis_0 = const()[name = string("op_5230_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_5230_0, tensor<fp16, [1, 2, 1, 128]> var_5230_1 = split(axis = var_5230_axis_0, split_sizes = var_5230_split_sizes_0, x = q_61)[name = string("op_5230")];
+            fp16 const_86_promoted = const()[name = string("const_86_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_5232 = mul(x = var_5230_1, y = const_86_promoted)[name = string("op_5232")];
+            int32 var_5234 = const()[name = string("op_5234"), val = int32(-1)];
+            bool var_5235_interleave_0 = const()[name = string("op_5235_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_5235 = concat(axis = var_5234, interleave = var_5235_interleave_0, values = (var_5232, var_5230_0))[name = string("op_5235")];
+            tensor<fp16, [1, 2, 1, 256]> var_5236_cast_fp16 = mul(x = var_5235, y = sin_s)[name = string("op_5236_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_215_cast_fp16 = add(x = var_5229_cast_fp16, y = var_5236_cast_fp16)[name = string("input_215_cast_fp16")];
+            tensor<int32, [8]> k_padded_13_pad_0 = const()[name = string("k_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_13_mode_0 = const()[name = string("k_padded_13_mode_0"), val = string("constant")];
+            fp16 const_87_to_fp16 = const()[name = string("const_87_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_13_cast_fp16 = pad(constant_val = const_87_to_fp16, mode = k_padded_13_mode_0, pad = k_padded_13_pad_0, x = input_215_cast_fp16)[name = string("k_padded_13_cast_fp16")];
+            tensor<int32, [8]> v_padded_13_pad_0 = const()[name = string("v_padded_13_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_13_mode_0 = const()[name = string("v_padded_13_mode_0"), val = string("constant")];
+            fp16 const_88_to_fp16 = const()[name = string("const_88_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_13_cast_fp16 = pad(constant_val = const_88_to_fp16, mode = v_padded_13_mode_0, pad = v_padded_13_pad_0, x = input_217_cast_fp16)[name = string("v_padded_13_cast_fp16")];
+            tensor<int32, [4]> var_5265_begin_0 = const()[name = string("op_5265_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5265_end_0 = const()[name = string("op_5265_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5265_end_mask_0 = const()[name = string("op_5265_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5265_cast_fp16 = slice_by_index(begin = var_5265_begin_0, end = var_5265_end_0, end_mask = var_5265_end_mask_0, x = K_sliding_slot_13_cast_fp16)[name = string("op_5265_cast_fp16")];
+            int32 var_5272 = const()[name = string("op_5272"), val = int32(2)];
+            bool K_sliding_out_13_interleave_0 = const()[name = string("K_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_13_cast_fp16 = concat(axis = var_5272, interleave = K_sliding_out_13_interleave_0, values = (var_5265_cast_fp16, k_padded_13_cast_fp16))[name = string("K_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> var_5288_begin_0 = const()[name = string("op_5288_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5288_end_0 = const()[name = string("op_5288_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5288_end_mask_0 = const()[name = string("op_5288_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5288_cast_fp16 = slice_by_index(begin = var_5288_begin_0, end = var_5288_end_0, end_mask = var_5288_end_mask_0, x = V_sliding_slot_13_cast_fp16)[name = string("op_5288_cast_fp16")];
+            int32 var_5295 = const()[name = string("op_5295"), val = int32(2)];
+            bool V_sliding_out_13_interleave_0 = const()[name = string("V_sliding_out_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_13_cast_fp16 = concat(axis = var_5295, interleave = V_sliding_out_13_interleave_0, values = (var_5288_cast_fp16, v_padded_13_cast_fp16))[name = string("V_sliding_out_13_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_15_begin_0 = const()[name = string("K_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_15_end_0 = const()[name = string("K_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_15_end_mask_0 = const()[name = string("K_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_15_cast_fp16 = slice_by_index(begin = K_for_attn_15_begin_0, end = K_for_attn_15_end_0, end_mask = K_for_attn_15_end_mask_0, x = K_sliding_out_13_cast_fp16)[name = string("K_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_15_begin_0 = const()[name = string("V_for_attn_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_15_end_0 = const()[name = string("V_for_attn_15_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_15_end_mask_0 = const()[name = string("V_for_attn_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_15_cast_fp16 = slice_by_index(begin = V_for_attn_15_begin_0, end = V_for_attn_15_end_0, end_mask = V_for_attn_15_end_mask_0, x = V_sliding_out_13_cast_fp16)[name = string("V_for_attn_15_cast_fp16")];
+            tensor<int32, [4]> transpose_28_perm_0 = const()[name = string("transpose_28_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_14_reps_0 = const()[name = string("tile_14_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_28_cast_fp16 = transpose(perm = transpose_28_perm_0, x = K_for_attn_15_cast_fp16)[name = string("transpose_175")];
+            tensor<fp16, [8, 1, 512, 256]> tile_14_cast_fp16 = tile(reps = tile_14_reps_0, x = transpose_28_cast_fp16)[name = string("tile_14_cast_fp16")];
+            tensor<int32, [5]> concat_28 = const()[name = string("concat_28"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_28_cast_fp16 = reshape(shape = concat_28, x = tile_14_cast_fp16)[name = string("reshape_28_cast_fp16")];
+            tensor<int32, [5]> transpose_29_perm_0 = const()[name = string("transpose_29_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_29 = const()[name = string("concat_29"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_29_cast_fp16 = transpose(perm = transpose_29_perm_0, x = reshape_28_cast_fp16)[name = string("transpose_174")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_29_cast_fp16 = reshape(shape = concat_29, x = transpose_29_cast_fp16)[name = string("reshape_29_cast_fp16")];
+            tensor<int32, [4]> transpose_91_perm_0 = const()[name = string("transpose_91_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_30_perm_0 = const()[name = string("transpose_30_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_15_reps_0 = const()[name = string("tile_15_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_30_cast_fp16 = transpose(perm = transpose_30_perm_0, x = V_for_attn_15_cast_fp16)[name = string("transpose_173")];
+            tensor<fp16, [8, 1, 512, 256]> tile_15_cast_fp16 = tile(reps = tile_15_reps_0, x = transpose_30_cast_fp16)[name = string("tile_15_cast_fp16")];
+            tensor<int32, [5]> concat_30 = const()[name = string("concat_30"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_30_cast_fp16 = reshape(shape = concat_30, x = tile_15_cast_fp16)[name = string("reshape_30_cast_fp16")];
+            tensor<int32, [5]> transpose_31_perm_0 = const()[name = string("transpose_31_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_31 = const()[name = string("concat_31"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_31_cast_fp16 = transpose(perm = transpose_31_perm_0, x = reshape_30_cast_fp16)[name = string("transpose_172")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_31_cast_fp16 = reshape(shape = concat_31, x = transpose_31_cast_fp16)[name = string("reshape_31_cast_fp16")];
+            tensor<int32, [4]> V_expanded_15_perm_0 = const()[name = string("V_expanded_15_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_91_cast_fp16 = transpose(perm = transpose_91_perm_0, x = reshape_29_cast_fp16)[name = string("transpose_171")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_63_cast_fp16, y = transpose_91_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_147_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_147_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_147_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 1, 512]> var_5336 = sub(x = x_147_cast_fp16, y = reduce_max_7)[name = string("op_5336")];
+            tensor<fp16, [1, 8, 1, 512]> var_5342 = exp(x = var_5336)[name = string("op_5342")];
+            tensor<int32, [1]> var_5352_axes_0 = const()[name = string("op_5352_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5352_keep_dims_0 = const()[name = string("op_5352_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_5352 = reduce_sum(axes = var_5352_axes_0, keep_dims = var_5352_keep_dims_0, x = var_5342)[name = string("op_5352")];
+            tensor<fp16, [1, 8, 1, 512]> var_5358_cast_fp16 = real_div(x = var_5342, y = var_5352)[name = string("op_5358_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_15_cast_fp16 = transpose(perm = V_expanded_15_perm_0, x = reshape_31_cast_fp16)[name = string("transpose_170")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_5358_cast_fp16, y = V_expanded_15_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_5369 = const()[name = string("op_5369"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5376 = const()[name = string("op_5376"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_5370_cast_fp16 = transpose(perm = var_5369, x = attn_output_43_cast_fp16)[name = string("transpose_169")];
+            tensor<fp16, [1, 1, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_5376, x = var_5370_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_5381 = const()[name = string("op_5381"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5397_pad_type_0 = const()[name = string("op_5397_pad_type_0"), val = string("valid")];
+            int32 var_5397_groups_0 = const()[name = string("op_5397_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5397_strides_0 = const()[name = string("op_5397_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5397_pad_0 = const()[name = string("op_5397_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5397_dilations_0 = const()[name = string("op_5397_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(938045056))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(940666560))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_5382_cast_fp16 = transpose(perm = var_5381, x = attn_output_45_cast_fp16)[name = string("transpose_168")];
+            tensor<fp16, [1, 2560, 1]> var_5397_cast_fp16 = conv(dilations = var_5397_dilations_0, groups = var_5397_groups_0, pad = var_5397_pad_0, pad_type = var_5397_pad_type_0, strides = var_5397_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_5382_cast_fp16)[name = string("op_5397_cast_fp16")];
+            tensor<int32, [3]> var_5401 = const()[name = string("op_5401"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5407 = const()[name = string("op_5407"), val = int32(-1)];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_151_cast_fp16 = transpose(perm = var_5401, x = var_5397_cast_fp16)[name = string("transpose_167")];
+            tensor<fp16, [1, 1, 2560]> var_5409_cast_fp16 = mul(x = x_151_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_5409_cast_fp16")];
+            bool input_221_interleave_0 = const()[name = string("input_221_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_221_cast_fp16 = concat(axis = var_5407, interleave = input_221_interleave_0, values = (x_151_cast_fp16, var_5409_cast_fp16))[name = string("input_221_cast_fp16")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5404_to_fp16 = const()[name = string("op_5404_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_5404_to_fp16, x = input_221_cast_fp16)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_5414_split_sizes_0 = const()[name = string("op_5414_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5414_axis_0 = const()[name = string("op_5414_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5414_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5414_cast_fp16_1 = split(axis = var_5414_axis_0, split_sizes = var_5414_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_5414_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(940669184)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_47_cast_fp16 = mul(x = var_5414_cast_fp16_0, y = layers_c2_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_153_cast_fp16 = add(x = x_139_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_153_cast_fp16")];
+            int32 var_5423 = const()[name = string("op_5423"), val = int32(-1)];
+            fp16 const_90_promoted_to_fp16 = const()[name = string("const_90_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5425_cast_fp16 = mul(x = x_153_cast_fp16, y = const_90_promoted_to_fp16)[name = string("op_5425_cast_fp16")];
+            bool input_223_interleave_0 = const()[name = string("input_223_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_223_cast_fp16 = concat(axis = var_5423, interleave = input_223_interleave_0, values = (x_153_cast_fp16, var_5425_cast_fp16))[name = string("input_223_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5420_to_fp16 = const()[name = string("op_5420_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_5420_to_fp16, x = input_223_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_5430_split_sizes_0 = const()[name = string("op_5430_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5430_axis_0 = const()[name = string("op_5430_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5430_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5430_cast_fp16_1 = split(axis = var_5430_axis_0, split_sizes = var_5430_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_5430_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(940674368)))];
+            tensor<fp16, [1, 1, 2560]> h_45_cast_fp16 = mul(x = var_5430_cast_fp16_0, y = layers_c2_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_5441 = const()[name = string("op_5441"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_225_axes_0 = const()[name = string("input_225_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5442 = transpose(perm = var_5441, x = h_45_cast_fp16)[name = string("transpose_166")];
+            tensor<fp16, [1, 2560, 1, 1]> input_225 = expand_dims(axes = input_225_axes_0, x = var_5442)[name = string("input_225")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_c2_7_mlp_gate_proj_weight_palettized, x = input_225)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_c2_7_mlp_up_proj_weight_palettized, x = input_225)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 1]> input_227 = mul(x = gate_31, y = up_15)[name = string("input_227")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_c2_7_mlp_down_proj_weight_palettized, x = input_227)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_5482_axes_0 = const()[name = string("op_5482_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5482 = squeeze(axes = var_5482_axes_0, x = mlp_out_15)[name = string("op_5482")];
+            tensor<int32, [3]> var_5486 = const()[name = string("op_5486"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5492 = const()[name = string("op_5492"), val = int32(-1)];
+            fp16 const_91_promoted = const()[name = string("const_91_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_155 = transpose(perm = var_5486, x = var_5482)[name = string("transpose_165")];
+            tensor<fp16, [1, 1, 2560]> var_5494 = mul(x = x_155, y = const_91_promoted)[name = string("op_5494")];
+            bool input_229_interleave_0 = const()[name = string("input_229_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_229 = concat(axis = var_5492, interleave = input_229_interleave_0, values = (x_155, var_5494))[name = string("input_229")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5489_to_fp16 = const()[name = string("op_5489_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_5489_to_fp16, x = input_229)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_5499_split_sizes_0 = const()[name = string("op_5499_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5499_axis_0 = const()[name = string("op_5499_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5499_0, tensor<fp16, [1, 1, 2560]> var_5499_1 = split(axis = var_5499_axis_0, split_sizes = var_5499_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_5499")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_73 = mul(x = var_5499_0, y = layers_c2_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_75_cast_fp16 = add(x = x_153_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 4864])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 1, 5120])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_5527 = const()[name = string("op_5527"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_231_axes_0 = const()[name = string("input_231_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5528 = transpose(perm = var_5527, x = hidden_states_75_cast_fp16)[name = string("transpose_164")];
+            tensor<fp16, [1, 2560, 1, 1]> input_231 = expand_dims(axes = input_231_axes_0, x = var_5528)[name = string("input_231")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_c2_7_per_layer_input_gate_weight_palettized, x = input_231)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_5547 = const()[name = string("op_5547"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_5548_cast_fp16 = transpose(perm = var_5547, x = per_layer_slice_15_cast_fp16)[name = string("transpose_163")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_5548_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_233_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_233_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(940679552))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(941007296))))[name = string("layers_c2_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_c2_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_233_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_5564_axes_0 = const()[name = string("op_5564_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5564_cast_fp16 = squeeze(axes = var_5564_axes_0, x = gated_47_cast_fp16)[name = string("op_5564_cast_fp16")];
+            tensor<int32, [3]> var_5568 = const()[name = string("op_5568"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5574 = const()[name = string("op_5574"), val = int32(-1)];
+            fp16 const_92_promoted_to_fp16 = const()[name = string("const_92_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_157_cast_fp16 = transpose(perm = var_5568, x = var_5564_cast_fp16)[name = string("transpose_162")];
+            tensor<fp16, [1, 1, 2560]> var_5576_cast_fp16 = mul(x = x_157_cast_fp16, y = const_92_promoted_to_fp16)[name = string("op_5576_cast_fp16")];
+            bool input_235_interleave_0 = const()[name = string("input_235_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_235_cast_fp16 = concat(axis = var_5574, interleave = input_235_interleave_0, values = (x_157_cast_fp16, var_5576_cast_fp16))[name = string("input_235_cast_fp16")];
+            tensor<int32, [1]> normed_221_axes_0 = const()[name = string("normed_221_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5571_to_fp16 = const()[name = string("op_5571_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_221_cast_fp16 = layer_norm(axes = normed_221_axes_0, epsilon = var_5571_to_fp16, x = input_235_cast_fp16)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [2]> var_5581_split_sizes_0 = const()[name = string("op_5581_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5581_axis_0 = const()[name = string("op_5581_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5581_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5581_cast_fp16_1 = split(axis = var_5581_axis_0, split_sizes = var_5581_split_sizes_0, x = normed_221_cast_fp16)[name = string("op_5581_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(941009920)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_79_cast_fp16 = mul(x = var_5581_cast_fp16_0, y = layers_c2_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_93_promoted_to_fp16 = const()[name = string("const_93_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.06p-1])];
+            tensor<fp16, [1, 1, 2560]> x_159_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_93_promoted_to_fp16)[name = string("x_159_cast_fp16")];
+            tensor<int32, [1]> var_5593_axes_0 = const()[name = string("op_5593_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5593_cast_fp16 = squeeze(axes = var_5593_axes_0, x = K_sliding_out_13_cast_fp16)[name = string("op_5593_cast_fp16")];
+            tensor<int32, [1]> var_5595_axes_0 = const()[name = string("op_5595_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_5595_cast_fp16 = squeeze(axes = var_5595_axes_0, x = V_sliding_out_13_cast_fp16)[name = string("op_5595_cast_fp16")];
+            tensor<int32, [4]> var_5598_begin_0 = const()[name = string("op_5598_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5598_end_0 = const()[name = string("op_5598_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5598_end_mask_0 = const()[name = string("op_5598_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5598_squeeze_mask_0 = const()[name = string("op_5598_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5598_cast_fp16 = slice_by_index(begin = var_5598_begin_0, end = var_5598_end_0, end_mask = var_5598_end_mask_0, squeeze_mask = var_5598_squeeze_mask_0, x = K_sliding_in)[name = string("op_5598_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_15_axes_0 = const()[name = string("K_sliding_slot_15_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_15_cast_fp16 = expand_dims(axes = K_sliding_slot_15_axes_0, x = var_5598_cast_fp16)[name = string("K_sliding_slot_15_cast_fp16")];
+            tensor<int32, [4]> var_5603_begin_0 = const()[name = string("op_5603_begin_0"), val = tensor<int32, [4]>([7, 0, 0, 0])];
+            tensor<int32, [4]> var_5603_end_0 = const()[name = string("op_5603_end_0"), val = tensor<int32, [4]>([8, 2, 512, 512])];
+            tensor<bool, [4]> var_5603_end_mask_0 = const()[name = string("op_5603_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_5603_squeeze_mask_0 = const()[name = string("op_5603_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_5603_cast_fp16 = slice_by_index(begin = var_5603_begin_0, end = var_5603_end_0, end_mask = var_5603_end_mask_0, squeeze_mask = var_5603_squeeze_mask_0, x = V_sliding_in)[name = string("op_5603_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_15_axes_0 = const()[name = string("V_sliding_slot_15_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_15_cast_fp16 = expand_dims(axes = V_sliding_slot_15_axes_0, x = var_5603_cast_fp16)[name = string("V_sliding_slot_15_cast_fp16")];
+            int32 var_5610 = const()[name = string("op_5610"), val = int32(-1)];
+            fp16 const_94_promoted_to_fp16 = const()[name = string("const_94_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5612_cast_fp16 = mul(x = x_159_cast_fp16, y = const_94_promoted_to_fp16)[name = string("op_5612_cast_fp16")];
+            bool input_237_interleave_0 = const()[name = string("input_237_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_237_cast_fp16 = concat(axis = var_5610, interleave = input_237_interleave_0, values = (x_159_cast_fp16, var_5612_cast_fp16))[name = string("input_237_cast_fp16")];
+            tensor<int32, [1]> normed_225_axes_0 = const()[name = string("normed_225_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5607_to_fp16 = const()[name = string("op_5607_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_225_cast_fp16 = layer_norm(axes = normed_225_axes_0, epsilon = var_5607_to_fp16, x = input_237_cast_fp16)[name = string("normed_225_cast_fp16")];
+            tensor<int32, [2]> var_5617_split_sizes_0 = const()[name = string("op_5617_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5617_axis_0 = const()[name = string("op_5617_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5617_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5617_cast_fp16_1 = split(axis = var_5617_axis_0, split_sizes = var_5617_split_sizes_0, x = normed_225_cast_fp16)[name = string("op_5617_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(941015104)))];
+            tensor<fp16, [1, 1, 2560]> h_49_cast_fp16 = mul(x = var_5617_cast_fp16_0, y = layers_c2_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_5623 = const()[name = string("op_5623"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_5626_axes_0 = const()[name = string("op_5626_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_5624_cast_fp16 = transpose(perm = var_5623, x = h_49_cast_fp16)[name = string("transpose_161")];
+            tensor<fp16, [1, 2560, 1, 1]> var_5626_cast_fp16 = expand_dims(axes = var_5626_axes_0, x = var_5624_cast_fp16)[name = string("op_5626_cast_fp16")];
+            string var_5642_pad_type_0 = const()[name = string("op_5642_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5642_strides_0 = const()[name = string("op_5642_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5642_pad_0 = const()[name = string("op_5642_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5642_dilations_0 = const()[name = string("op_5642_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5642_groups_0 = const()[name = string("op_5642_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_5642 = conv(dilations = var_5642_dilations_0, groups = var_5642_groups_0, pad = var_5642_pad_0, pad_type = var_5642_pad_type_0, strides = var_5642_strides_0, weight = layers_c2_8_self_attn_q_proj_weight_palettized, x = var_5626_cast_fp16)[name = string("op_5642")];
+            tensor<int32, [4]> var_5647 = const()[name = string("op_5647"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_5648 = reshape(shape = var_5647, x = var_5642)[name = string("op_5648")];
+            tensor<int32, [4]> var_5653 = const()[name = string("op_5653"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5663 = const()[name = string("op_5663"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_5654 = transpose(perm = var_5653, x = var_5648)[name = string("transpose_160")];
+            tensor<fp16, [1, 8, 256]> x_161 = reshape(shape = var_5663, x = var_5654)[name = string("x_161")];
+            int32 var_5669 = const()[name = string("op_5669"), val = int32(-1)];
+            fp16 const_95_promoted = const()[name = string("const_95_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_5671 = mul(x = x_161, y = const_95_promoted)[name = string("op_5671")];
+            bool input_241_interleave_0 = const()[name = string("input_241_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_241 = concat(axis = var_5669, interleave = input_241_interleave_0, values = (x_161, var_5671))[name = string("input_241")];
+            tensor<int32, [1]> normed_229_axes_0 = const()[name = string("normed_229_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5666_to_fp16 = const()[name = string("op_5666_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_229_cast_fp16 = layer_norm(axes = normed_229_axes_0, epsilon = var_5666_to_fp16, x = input_241)[name = string("normed_229_cast_fp16")];
+            tensor<int32, [2]> var_5676_split_sizes_0 = const()[name = string("op_5676_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5676_axis_0 = const()[name = string("op_5676_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_5676_0, tensor<fp16, [1, 8, 256]> var_5676_1 = split(axis = var_5676_axis_0, split_sizes = var_5676_split_sizes_0, x = normed_229_cast_fp16)[name = string("op_5676")];
+            tensor<int32, [4]> var_5683 = const()[name = string("op_5683"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_67 = reshape(shape = var_5683, x = var_5676_0)[name = string("q_67")];
+            tensor<fp16, [1, 8, 1, 256]> var_5685_cast_fp16 = mul(x = q_67, y = cos_s)[name = string("op_5685_cast_fp16")];
+            tensor<int32, [2]> var_5686_split_sizes_0 = const()[name = string("op_5686_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5686_axis_0 = const()[name = string("op_5686_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_5686_0, tensor<fp16, [1, 8, 1, 128]> var_5686_1 = split(axis = var_5686_axis_0, split_sizes = var_5686_split_sizes_0, x = q_67)[name = string("op_5686")];
+            fp16 const_96_promoted = const()[name = string("const_96_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_5688 = mul(x = var_5686_1, y = const_96_promoted)[name = string("op_5688")];
+            int32 var_5690 = const()[name = string("op_5690"), val = int32(-1)];
+            bool var_5691_interleave_0 = const()[name = string("op_5691_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_5691 = concat(axis = var_5690, interleave = var_5691_interleave_0, values = (var_5688, var_5686_0))[name = string("op_5691")];
+            tensor<fp16, [1, 8, 1, 256]> var_5692_cast_fp16 = mul(x = var_5691, y = sin_s)[name = string("op_5692_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_71_cast_fp16 = add(x = var_5685_cast_fp16, y = var_5692_cast_fp16)[name = string("q_71_cast_fp16")];
+            string var_5705_pad_type_0 = const()[name = string("op_5705_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5705_strides_0 = const()[name = string("op_5705_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5705_pad_0 = const()[name = string("op_5705_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5705_dilations_0 = const()[name = string("op_5705_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5705_groups_0 = const()[name = string("op_5705_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5705 = conv(dilations = var_5705_dilations_0, groups = var_5705_groups_0, pad = var_5705_pad_0, pad_type = var_5705_pad_type_0, strides = var_5705_strides_0, weight = layers_c2_8_self_attn_k_proj_weight_palettized, x = var_5626_cast_fp16)[name = string("op_5705")];
+            tensor<int32, [4]> var_5710 = const()[name = string("op_5710"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5711 = reshape(shape = var_5710, x = var_5705)[name = string("op_5711")];
+            tensor<int32, [4]> var_5716 = const()[name = string("op_5716"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_5733_pad_type_0 = const()[name = string("op_5733_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_5733_strides_0 = const()[name = string("op_5733_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_5733_pad_0 = const()[name = string("op_5733_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_5733_dilations_0 = const()[name = string("op_5733_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_5733_groups_0 = const()[name = string("op_5733_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_5733 = conv(dilations = var_5733_dilations_0, groups = var_5733_groups_0, pad = var_5733_pad_0, pad_type = var_5733_pad_type_0, strides = var_5733_strides_0, weight = layers_c2_8_self_attn_v_proj_weight_palettized, x = var_5626_cast_fp16)[name = string("op_5733")];
+            tensor<int32, [4]> var_5738 = const()[name = string("op_5738"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_5739 = reshape(shape = var_5738, x = var_5733)[name = string("op_5739")];
+            tensor<int32, [4]> var_5744 = const()[name = string("op_5744"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_5754 = const()[name = string("op_5754"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_5717 = transpose(perm = var_5716, x = var_5711)[name = string("transpose_159")];
+            tensor<fp16, [1, 2, 256]> x_163 = reshape(shape = var_5754, x = var_5717)[name = string("x_163")];
+            int32 var_5760 = const()[name = string("op_5760"), val = int32(-1)];
+            fp16 const_97_promoted = const()[name = string("const_97_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_5762 = mul(x = x_163, y = const_97_promoted)[name = string("op_5762")];
+            bool input_243_interleave_0 = const()[name = string("input_243_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_243 = concat(axis = var_5760, interleave = input_243_interleave_0, values = (x_163, var_5762))[name = string("input_243")];
+            tensor<int32, [1]> normed_233_axes_0 = const()[name = string("normed_233_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5757_to_fp16 = const()[name = string("op_5757_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_233_cast_fp16 = layer_norm(axes = normed_233_axes_0, epsilon = var_5757_to_fp16, x = input_243)[name = string("normed_233_cast_fp16")];
+            tensor<int32, [2]> var_5767_split_sizes_0 = const()[name = string("op_5767_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_5767_axis_0 = const()[name = string("op_5767_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_5767_0, tensor<fp16, [1, 2, 256]> var_5767_1 = split(axis = var_5767_axis_0, split_sizes = var_5767_split_sizes_0, x = normed_233_cast_fp16)[name = string("op_5767")];
+            tensor<fp16, [1, 2, 256]> var_5769 = mul(x = var_5767_0, y = layers_c2_8_self_attn_k_norm_weight)[name = string("op_5769")];
+            tensor<int32, [4]> var_5774 = const()[name = string("op_5774"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_69 = reshape(shape = var_5774, x = var_5769)[name = string("q_69")];
+            fp16 var_5776_promoted = const()[name = string("op_5776_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_5745 = transpose(perm = var_5744, x = var_5739)[name = string("transpose_158")];
+            tensor<fp16, [1, 2, 1, 256]> var_5777 = pow(x = var_5745, y = var_5776_promoted)[name = string("op_5777")];
+            tensor<int32, [1]> var_5782_axes_0 = const()[name = string("op_5782_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5782_keep_dims_0 = const()[name = string("op_5782_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_5782 = reduce_mean(axes = var_5782_axes_0, keep_dims = var_5782_keep_dims_0, x = var_5777)[name = string("op_5782")];
+            fp16 var_5784_to_fp16 = const()[name = string("op_5784_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_17_cast_fp16 = add(x = var_5782, y = var_5784_to_fp16)[name = string("mean_sq_17_cast_fp16")];
+            fp32 var_5786_epsilon_0 = const()[name = string("op_5786_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_5786_cast_fp16 = rsqrt(epsilon = var_5786_epsilon_0, x = mean_sq_17_cast_fp16)[name = string("op_5786_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_247_cast_fp16 = mul(x = var_5745, y = var_5786_cast_fp16)[name = string("input_247_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_5788_cast_fp16 = mul(x = q_69, y = cos_s)[name = string("op_5788_cast_fp16")];
+            tensor<int32, [2]> var_5789_split_sizes_0 = const()[name = string("op_5789_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_5789_axis_0 = const()[name = string("op_5789_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_5789_0, tensor<fp16, [1, 2, 1, 128]> var_5789_1 = split(axis = var_5789_axis_0, split_sizes = var_5789_split_sizes_0, x = q_69)[name = string("op_5789")];
+            fp16 const_98_promoted = const()[name = string("const_98_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_5791 = mul(x = var_5789_1, y = const_98_promoted)[name = string("op_5791")];
+            int32 var_5793 = const()[name = string("op_5793"), val = int32(-1)];
+            bool var_5794_interleave_0 = const()[name = string("op_5794_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_5794 = concat(axis = var_5793, interleave = var_5794_interleave_0, values = (var_5791, var_5789_0))[name = string("op_5794")];
+            tensor<fp16, [1, 2, 1, 256]> var_5795_cast_fp16 = mul(x = var_5794, y = sin_s)[name = string("op_5795_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_245_cast_fp16 = add(x = var_5788_cast_fp16, y = var_5795_cast_fp16)[name = string("input_245_cast_fp16")];
+            tensor<int32, [8]> k_padded_15_pad_0 = const()[name = string("k_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_15_mode_0 = const()[name = string("k_padded_15_mode_0"), val = string("constant")];
+            fp16 const_99_to_fp16 = const()[name = string("const_99_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_15_cast_fp16 = pad(constant_val = const_99_to_fp16, mode = k_padded_15_mode_0, pad = k_padded_15_pad_0, x = input_245_cast_fp16)[name = string("k_padded_15_cast_fp16")];
+            tensor<int32, [8]> v_padded_15_pad_0 = const()[name = string("v_padded_15_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_15_mode_0 = const()[name = string("v_padded_15_mode_0"), val = string("constant")];
+            fp16 const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_15_cast_fp16 = pad(constant_val = const_100_to_fp16, mode = v_padded_15_mode_0, pad = v_padded_15_pad_0, x = input_247_cast_fp16)[name = string("v_padded_15_cast_fp16")];
+            tensor<int32, [4]> var_5824_begin_0 = const()[name = string("op_5824_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5824_end_0 = const()[name = string("op_5824_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5824_end_mask_0 = const()[name = string("op_5824_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5824_cast_fp16 = slice_by_index(begin = var_5824_begin_0, end = var_5824_end_0, end_mask = var_5824_end_mask_0, x = K_sliding_slot_15_cast_fp16)[name = string("op_5824_cast_fp16")];
+            int32 var_5831 = const()[name = string("op_5831"), val = int32(2)];
+            bool K_sliding_out_15_interleave_0 = const()[name = string("K_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_15_cast_fp16 = concat(axis = var_5831, interleave = K_sliding_out_15_interleave_0, values = (var_5824_cast_fp16, k_padded_15_cast_fp16))[name = string("K_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> var_5847_begin_0 = const()[name = string("op_5847_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_5847_end_0 = const()[name = string("op_5847_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_5847_end_mask_0 = const()[name = string("op_5847_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_5847_cast_fp16 = slice_by_index(begin = var_5847_begin_0, end = var_5847_end_0, end_mask = var_5847_end_mask_0, x = V_sliding_slot_15_cast_fp16)[name = string("op_5847_cast_fp16")];
+            int32 var_5854 = const()[name = string("op_5854"), val = int32(2)];
+            bool V_sliding_out_15_interleave_0 = const()[name = string("V_sliding_out_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_15_cast_fp16 = concat(axis = var_5854, interleave = V_sliding_out_15_interleave_0, values = (var_5847_cast_fp16, v_padded_15_cast_fp16))[name = string("V_sliding_out_15_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_17_begin_0 = const()[name = string("K_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_17_end_0 = const()[name = string("K_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_17_end_mask_0 = const()[name = string("K_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_17_cast_fp16 = slice_by_index(begin = K_for_attn_17_begin_0, end = K_for_attn_17_end_0, end_mask = K_for_attn_17_end_mask_0, x = K_sliding_out_15_cast_fp16)[name = string("K_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_17_begin_0 = const()[name = string("V_for_attn_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_17_end_0 = const()[name = string("V_for_attn_17_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_17_end_mask_0 = const()[name = string("V_for_attn_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_17_cast_fp16 = slice_by_index(begin = V_for_attn_17_begin_0, end = V_for_attn_17_end_0, end_mask = V_for_attn_17_end_mask_0, x = V_sliding_out_15_cast_fp16)[name = string("V_for_attn_17_cast_fp16")];
+            tensor<int32, [4]> transpose_32_perm_0 = const()[name = string("transpose_32_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_16_reps_0 = const()[name = string("tile_16_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_32_cast_fp16 = transpose(perm = transpose_32_perm_0, x = K_for_attn_17_cast_fp16)[name = string("transpose_157")];
+            tensor<fp16, [8, 1, 512, 256]> tile_16_cast_fp16 = tile(reps = tile_16_reps_0, x = transpose_32_cast_fp16)[name = string("tile_16_cast_fp16")];
+            tensor<int32, [5]> concat_32 = const()[name = string("concat_32"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_32_cast_fp16 = reshape(shape = concat_32, x = tile_16_cast_fp16)[name = string("reshape_32_cast_fp16")];
+            tensor<int32, [5]> transpose_33_perm_0 = const()[name = string("transpose_33_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_33 = const()[name = string("concat_33"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_33_cast_fp16 = transpose(perm = transpose_33_perm_0, x = reshape_32_cast_fp16)[name = string("transpose_156")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_33_cast_fp16 = reshape(shape = concat_33, x = transpose_33_cast_fp16)[name = string("reshape_33_cast_fp16")];
+            tensor<int32, [4]> transpose_92_perm_0 = const()[name = string("transpose_92_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_34_perm_0 = const()[name = string("transpose_34_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_17_reps_0 = const()[name = string("tile_17_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_34_cast_fp16 = transpose(perm = transpose_34_perm_0, x = V_for_attn_17_cast_fp16)[name = string("transpose_155")];
+            tensor<fp16, [8, 1, 512, 256]> tile_17_cast_fp16 = tile(reps = tile_17_reps_0, x = transpose_34_cast_fp16)[name = string("tile_17_cast_fp16")];
+            tensor<int32, [5]> concat_34 = const()[name = string("concat_34"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_34_cast_fp16 = reshape(shape = concat_34, x = tile_17_cast_fp16)[name = string("reshape_34_cast_fp16")];
+            tensor<int32, [5]> transpose_35_perm_0 = const()[name = string("transpose_35_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_35 = const()[name = string("concat_35"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_35_cast_fp16 = transpose(perm = transpose_35_perm_0, x = reshape_34_cast_fp16)[name = string("transpose_154")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_35_cast_fp16 = reshape(shape = concat_35, x = transpose_35_cast_fp16)[name = string("reshape_35_cast_fp16")];
+            tensor<int32, [4]> V_expanded_17_perm_0 = const()[name = string("V_expanded_17_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_92_cast_fp16 = transpose(perm = transpose_92_perm_0, x = reshape_33_cast_fp16)[name = string("transpose_153")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_71_cast_fp16, y = transpose_92_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_167_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_167_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_167_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 1, 512]> var_5895 = sub(x = x_167_cast_fp16, y = reduce_max_8)[name = string("op_5895")];
+            tensor<fp16, [1, 8, 1, 512]> var_5901 = exp(x = var_5895)[name = string("op_5901")];
+            tensor<int32, [1]> var_5911_axes_0 = const()[name = string("op_5911_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5911_keep_dims_0 = const()[name = string("op_5911_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_5911 = reduce_sum(axes = var_5911_axes_0, keep_dims = var_5911_keep_dims_0, x = var_5901)[name = string("op_5911")];
+            tensor<fp16, [1, 8, 1, 512]> var_5917_cast_fp16 = real_div(x = var_5901, y = var_5911)[name = string("op_5917_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_17_cast_fp16 = transpose(perm = V_expanded_17_perm_0, x = reshape_35_cast_fp16)[name = string("transpose_152")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_5917_cast_fp16, y = V_expanded_17_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_5928 = const()[name = string("op_5928"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5935 = const()[name = string("op_5935"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_5929_cast_fp16 = transpose(perm = var_5928, x = attn_output_49_cast_fp16)[name = string("transpose_151")];
+            tensor<fp16, [1, 1, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_5935, x = var_5929_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_5940 = const()[name = string("op_5940"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_5956_pad_type_0 = const()[name = string("op_5956_pad_type_0"), val = string("valid")];
+            int32 var_5956_groups_0 = const()[name = string("op_5956_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_5956_strides_0 = const()[name = string("op_5956_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_5956_pad_0 = const()[name = string("op_5956_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_5956_dilations_0 = const()[name = string("op_5956_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(941020288))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943641792))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_5941_cast_fp16 = transpose(perm = var_5940, x = attn_output_51_cast_fp16)[name = string("transpose_150")];
+            tensor<fp16, [1, 2560, 1]> var_5956_cast_fp16 = conv(dilations = var_5956_dilations_0, groups = var_5956_groups_0, pad = var_5956_pad_0, pad_type = var_5956_pad_type_0, strides = var_5956_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_5941_cast_fp16)[name = string("op_5956_cast_fp16")];
+            tensor<int32, [3]> var_5960 = const()[name = string("op_5960"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_5966 = const()[name = string("op_5966"), val = int32(-1)];
+            fp16 const_101_promoted_to_fp16 = const()[name = string("const_101_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_171_cast_fp16 = transpose(perm = var_5960, x = var_5956_cast_fp16)[name = string("transpose_149")];
+            tensor<fp16, [1, 1, 2560]> var_5968_cast_fp16 = mul(x = x_171_cast_fp16, y = const_101_promoted_to_fp16)[name = string("op_5968_cast_fp16")];
+            bool input_251_interleave_0 = const()[name = string("input_251_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_251_cast_fp16 = concat(axis = var_5966, interleave = input_251_interleave_0, values = (x_171_cast_fp16, var_5968_cast_fp16))[name = string("input_251_cast_fp16")];
+            tensor<int32, [1]> normed_237_axes_0 = const()[name = string("normed_237_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5963_to_fp16 = const()[name = string("op_5963_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_237_cast_fp16 = layer_norm(axes = normed_237_axes_0, epsilon = var_5963_to_fp16, x = input_251_cast_fp16)[name = string("normed_237_cast_fp16")];
+            tensor<int32, [2]> var_5973_split_sizes_0 = const()[name = string("op_5973_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5973_axis_0 = const()[name = string("op_5973_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5973_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5973_cast_fp16_1 = split(axis = var_5973_axis_0, split_sizes = var_5973_split_sizes_0, x = normed_237_cast_fp16)[name = string("op_5973_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943644416)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_53_cast_fp16 = mul(x = var_5973_cast_fp16_0, y = layers_c2_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_53_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_173_cast_fp16 = add(x = x_159_cast_fp16, y = attn_output_53_cast_fp16)[name = string("x_173_cast_fp16")];
+            int32 var_5982 = const()[name = string("op_5982"), val = int32(-1)];
+            fp16 const_102_promoted_to_fp16 = const()[name = string("const_102_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_5984_cast_fp16 = mul(x = x_173_cast_fp16, y = const_102_promoted_to_fp16)[name = string("op_5984_cast_fp16")];
+            bool input_253_interleave_0 = const()[name = string("input_253_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_253_cast_fp16 = concat(axis = var_5982, interleave = input_253_interleave_0, values = (x_173_cast_fp16, var_5984_cast_fp16))[name = string("input_253_cast_fp16")];
+            tensor<int32, [1]> normed_241_axes_0 = const()[name = string("normed_241_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5979_to_fp16 = const()[name = string("op_5979_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_241_cast_fp16 = layer_norm(axes = normed_241_axes_0, epsilon = var_5979_to_fp16, x = input_253_cast_fp16)[name = string("normed_241_cast_fp16")];
+            tensor<int32, [2]> var_5989_split_sizes_0 = const()[name = string("op_5989_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_5989_axis_0 = const()[name = string("op_5989_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_5989_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_5989_cast_fp16_1 = split(axis = var_5989_axis_0, split_sizes = var_5989_split_sizes_0, x = normed_241_cast_fp16)[name = string("op_5989_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943649600)))];
+            tensor<fp16, [1, 1, 2560]> h_51_cast_fp16 = mul(x = var_5989_cast_fp16_0, y = layers_c2_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_6000 = const()[name = string("op_6000"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_255_axes_0 = const()[name = string("input_255_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6001 = transpose(perm = var_6000, x = h_51_cast_fp16)[name = string("transpose_148")];
+            tensor<fp16, [1, 2560, 1, 1]> input_255 = expand_dims(axes = input_255_axes_0, x = var_6001)[name = string("input_255")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_c2_8_mlp_gate_proj_weight_palettized, x = input_255)[name = string("gate_33")];
+            string up_17_pad_type_0 = const()[name = string("up_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_17_strides_0 = const()[name = string("up_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_17_pad_0 = const()[name = string("up_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_17_dilations_0 = const()[name = string("up_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_17_groups_0 = const()[name = string("up_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_17 = conv(dilations = up_17_dilations_0, groups = up_17_groups_0, pad = up_17_pad_0, pad_type = up_17_pad_type_0, strides = up_17_strides_0, weight = layers_c2_8_mlp_up_proj_weight_palettized, x = input_255)[name = string("up_17")];
+            string gate_35_mode_0 = const()[name = string("gate_35_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_35 = gelu(mode = gate_35_mode_0, x = gate_33)[name = string("gate_35")];
+            tensor<fp16, [1, 10240, 1, 1]> input_257 = mul(x = gate_35, y = up_17)[name = string("input_257")];
+            string mlp_out_17_pad_type_0 = const()[name = string("mlp_out_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_17_strides_0 = const()[name = string("mlp_out_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_17_pad_0 = const()[name = string("mlp_out_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_17_dilations_0 = const()[name = string("mlp_out_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_17_groups_0 = const()[name = string("mlp_out_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_17 = conv(dilations = mlp_out_17_dilations_0, groups = mlp_out_17_groups_0, pad = mlp_out_17_pad_0, pad_type = mlp_out_17_pad_type_0, strides = mlp_out_17_strides_0, weight = layers_c2_8_mlp_down_proj_weight_palettized, x = input_257)[name = string("mlp_out_17")];
+            tensor<int32, [1]> var_6041_axes_0 = const()[name = string("op_6041_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6041 = squeeze(axes = var_6041_axes_0, x = mlp_out_17)[name = string("op_6041")];
+            tensor<int32, [3]> var_6045 = const()[name = string("op_6045"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6051 = const()[name = string("op_6051"), val = int32(-1)];
+            fp16 const_103_promoted = const()[name = string("const_103_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_175 = transpose(perm = var_6045, x = var_6041)[name = string("transpose_147")];
+            tensor<fp16, [1, 1, 2560]> var_6053 = mul(x = x_175, y = const_103_promoted)[name = string("op_6053")];
+            bool input_259_interleave_0 = const()[name = string("input_259_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_259 = concat(axis = var_6051, interleave = input_259_interleave_0, values = (x_175, var_6053))[name = string("input_259")];
+            tensor<int32, [1]> normed_245_axes_0 = const()[name = string("normed_245_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6048_to_fp16 = const()[name = string("op_6048_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_245_cast_fp16 = layer_norm(axes = normed_245_axes_0, epsilon = var_6048_to_fp16, x = input_259)[name = string("normed_245_cast_fp16")];
+            tensor<int32, [2]> var_6058_split_sizes_0 = const()[name = string("op_6058_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6058_axis_0 = const()[name = string("op_6058_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6058_0, tensor<fp16, [1, 1, 2560]> var_6058_1 = split(axis = var_6058_axis_0, split_sizes = var_6058_split_sizes_0, x = normed_245_cast_fp16)[name = string("op_6058")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_83 = mul(x = var_6058_0, y = layers_c2_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_85_cast_fp16 = add(x = x_173_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_17_begin_0 = const()[name = string("per_layer_slice_17_begin_0"), val = tensor<int32, [3]>([0, 0, 5120])];
+            tensor<int32, [3]> per_layer_slice_17_end_0 = const()[name = string("per_layer_slice_17_end_0"), val = tensor<int32, [3]>([1, 1, 5376])];
+            tensor<bool, [3]> per_layer_slice_17_end_mask_0 = const()[name = string("per_layer_slice_17_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_17_cast_fp16 = slice_by_index(begin = per_layer_slice_17_begin_0, end = per_layer_slice_17_end_0, end_mask = per_layer_slice_17_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_17_cast_fp16")];
+            tensor<int32, [3]> var_6086 = const()[name = string("op_6086"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_261_axes_0 = const()[name = string("input_261_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6087 = transpose(perm = var_6086, x = hidden_states_85_cast_fp16)[name = string("transpose_146")];
+            tensor<fp16, [1, 2560, 1, 1]> input_261 = expand_dims(axes = input_261_axes_0, x = var_6087)[name = string("input_261")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_c2_8_per_layer_input_gate_weight_palettized, x = input_261)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_6106 = const()[name = string("op_6106"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_17_axes_0 = const()[name = string("per_layer_slice_conv_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_6107_cast_fp16 = transpose(perm = var_6106, x = per_layer_slice_17_cast_fp16)[name = string("transpose_145")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_17_cast_fp16 = expand_dims(axes = per_layer_slice_conv_17_axes_0, x = var_6107_cast_fp16)[name = string("per_layer_slice_conv_17_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_263_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_17_cast_fp16)[name = string("input_263_cast_fp16")];
+            string gated_53_pad_type_0 = const()[name = string("gated_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_53_strides_0 = const()[name = string("gated_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_53_pad_0 = const()[name = string("gated_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_53_dilations_0 = const()[name = string("gated_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_53_groups_0 = const()[name = string("gated_53_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943654784))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943982528))))[name = string("layers_c2_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_53_cast_fp16 = conv(dilations = gated_53_dilations_0, groups = gated_53_groups_0, pad = gated_53_pad_0, pad_type = gated_53_pad_type_0, strides = gated_53_strides_0, weight = layers_c2_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_263_cast_fp16)[name = string("gated_53_cast_fp16")];
+            tensor<int32, [1]> var_6123_axes_0 = const()[name = string("op_6123_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6123_cast_fp16 = squeeze(axes = var_6123_axes_0, x = gated_53_cast_fp16)[name = string("op_6123_cast_fp16")];
+            tensor<int32, [3]> var_6127 = const()[name = string("op_6127"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6133 = const()[name = string("op_6133"), val = int32(-1)];
+            fp16 const_104_promoted_to_fp16 = const()[name = string("const_104_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_177_cast_fp16 = transpose(perm = var_6127, x = var_6123_cast_fp16)[name = string("transpose_144")];
+            tensor<fp16, [1, 1, 2560]> var_6135_cast_fp16 = mul(x = x_177_cast_fp16, y = const_104_promoted_to_fp16)[name = string("op_6135_cast_fp16")];
+            bool input_265_interleave_0 = const()[name = string("input_265_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_265_cast_fp16 = concat(axis = var_6133, interleave = input_265_interleave_0, values = (x_177_cast_fp16, var_6135_cast_fp16))[name = string("input_265_cast_fp16")];
+            tensor<int32, [1]> normed_249_axes_0 = const()[name = string("normed_249_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6130_to_fp16 = const()[name = string("op_6130_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_249_cast_fp16 = layer_norm(axes = normed_249_axes_0, epsilon = var_6130_to_fp16, x = input_265_cast_fp16)[name = string("normed_249_cast_fp16")];
+            tensor<int32, [2]> var_6140_split_sizes_0 = const()[name = string("op_6140_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6140_axis_0 = const()[name = string("op_6140_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6140_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6140_cast_fp16_1 = split(axis = var_6140_axis_0, split_sizes = var_6140_split_sizes_0, x = normed_249_cast_fp16)[name = string("op_6140_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943985152)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_89_cast_fp16 = mul(x = var_6140_cast_fp16_0, y = layers_c2_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_91_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_91_cast_fp16")];
+            tensor<fp16, [1]> const_105_promoted_to_fp16 = const()[name = string("const_105_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.bap-2])];
+            tensor<fp16, [1, 1, 2560]> x_179_cast_fp16 = mul(x = hidden_states_91_cast_fp16, y = const_105_promoted_to_fp16)[name = string("x_179_cast_fp16")];
+            tensor<int32, [1]> var_6152_axes_0 = const()[name = string("op_6152_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6152_cast_fp16 = squeeze(axes = var_6152_axes_0, x = K_sliding_out_15_cast_fp16)[name = string("op_6152_cast_fp16")];
+            tensor<int32, [1]> var_6154_axes_0 = const()[name = string("op_6154_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6154_cast_fp16 = squeeze(axes = var_6154_axes_0, x = V_sliding_out_15_cast_fp16)[name = string("op_6154_cast_fp16")];
+            tensor<int32, [4]> var_6157_begin_0 = const()[name = string("op_6157_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6157_end_0 = const()[name = string("op_6157_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_6157_end_mask_0 = const()[name = string("op_6157_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6157_squeeze_mask_0 = const()[name = string("op_6157_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6157_cast_fp16 = slice_by_index(begin = var_6157_begin_0, end = var_6157_end_0, end_mask = var_6157_end_mask_0, squeeze_mask = var_6157_squeeze_mask_0, x = K_sliding_in)[name = string("op_6157_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_17_axes_0 = const()[name = string("K_sliding_slot_17_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_17_cast_fp16 = expand_dims(axes = K_sliding_slot_17_axes_0, x = var_6157_cast_fp16)[name = string("K_sliding_slot_17_cast_fp16")];
+            tensor<int32, [4]> var_6162_begin_0 = const()[name = string("op_6162_begin_0"), val = tensor<int32, [4]>([8, 0, 0, 0])];
+            tensor<int32, [4]> var_6162_end_0 = const()[name = string("op_6162_end_0"), val = tensor<int32, [4]>([9, 2, 512, 512])];
+            tensor<bool, [4]> var_6162_end_mask_0 = const()[name = string("op_6162_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6162_squeeze_mask_0 = const()[name = string("op_6162_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6162_cast_fp16 = slice_by_index(begin = var_6162_begin_0, end = var_6162_end_0, end_mask = var_6162_end_mask_0, squeeze_mask = var_6162_squeeze_mask_0, x = V_sliding_in)[name = string("op_6162_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_17_axes_0 = const()[name = string("V_sliding_slot_17_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_17_cast_fp16 = expand_dims(axes = V_sliding_slot_17_axes_0, x = var_6162_cast_fp16)[name = string("V_sliding_slot_17_cast_fp16")];
+            int32 var_6169 = const()[name = string("op_6169"), val = int32(-1)];
+            fp16 const_106_promoted_to_fp16 = const()[name = string("const_106_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6171_cast_fp16 = mul(x = x_179_cast_fp16, y = const_106_promoted_to_fp16)[name = string("op_6171_cast_fp16")];
+            bool input_267_interleave_0 = const()[name = string("input_267_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_267_cast_fp16 = concat(axis = var_6169, interleave = input_267_interleave_0, values = (x_179_cast_fp16, var_6171_cast_fp16))[name = string("input_267_cast_fp16")];
+            tensor<int32, [1]> normed_253_axes_0 = const()[name = string("normed_253_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6166_to_fp16 = const()[name = string("op_6166_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_253_cast_fp16 = layer_norm(axes = normed_253_axes_0, epsilon = var_6166_to_fp16, x = input_267_cast_fp16)[name = string("normed_253_cast_fp16")];
+            tensor<int32, [2]> var_6176_split_sizes_0 = const()[name = string("op_6176_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6176_axis_0 = const()[name = string("op_6176_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6176_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6176_cast_fp16_1 = split(axis = var_6176_axis_0, split_sizes = var_6176_split_sizes_0, x = normed_253_cast_fp16)[name = string("op_6176_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_9_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_9_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943990336)))];
+            tensor<fp16, [1, 1, 2560]> h_55_cast_fp16 = mul(x = var_6176_cast_fp16_0, y = layers_c2_9_input_layernorm_weight_promoted_to_fp16)[name = string("h_55_cast_fp16")];
+            tensor<int32, [3]> var_6182 = const()[name = string("op_6182"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6185_axes_0 = const()[name = string("op_6185_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6183_cast_fp16 = transpose(perm = var_6182, x = h_55_cast_fp16)[name = string("transpose_143")];
+            tensor<fp16, [1, 2560, 1, 1]> var_6185_cast_fp16 = expand_dims(axes = var_6185_axes_0, x = var_6183_cast_fp16)[name = string("op_6185_cast_fp16")];
+            string var_6201_pad_type_0 = const()[name = string("op_6201_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6201_strides_0 = const()[name = string("op_6201_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6201_pad_0 = const()[name = string("op_6201_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6201_dilations_0 = const()[name = string("op_6201_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6201_groups_0 = const()[name = string("op_6201_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_6201 = conv(dilations = var_6201_dilations_0, groups = var_6201_groups_0, pad = var_6201_pad_0, pad_type = var_6201_pad_type_0, strides = var_6201_strides_0, weight = layers_c2_9_self_attn_q_proj_weight_palettized, x = var_6185_cast_fp16)[name = string("op_6201")];
+            tensor<int32, [4]> var_6206 = const()[name = string("op_6206"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_6207 = reshape(shape = var_6206, x = var_6201)[name = string("op_6207")];
+            tensor<int32, [4]> var_6212 = const()[name = string("op_6212"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6222 = const()[name = string("op_6222"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_6213 = transpose(perm = var_6212, x = var_6207)[name = string("transpose_142")];
+            tensor<fp16, [1, 8, 256]> x_181 = reshape(shape = var_6222, x = var_6213)[name = string("x_181")];
+            int32 var_6228 = const()[name = string("op_6228"), val = int32(-1)];
+            fp16 const_107_promoted = const()[name = string("const_107_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_6230 = mul(x = x_181, y = const_107_promoted)[name = string("op_6230")];
+            bool input_271_interleave_0 = const()[name = string("input_271_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_271 = concat(axis = var_6228, interleave = input_271_interleave_0, values = (x_181, var_6230))[name = string("input_271")];
+            tensor<int32, [1]> normed_257_axes_0 = const()[name = string("normed_257_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6225_to_fp16 = const()[name = string("op_6225_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_257_cast_fp16 = layer_norm(axes = normed_257_axes_0, epsilon = var_6225_to_fp16, x = input_271)[name = string("normed_257_cast_fp16")];
+            tensor<int32, [2]> var_6235_split_sizes_0 = const()[name = string("op_6235_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6235_axis_0 = const()[name = string("op_6235_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_6235_0, tensor<fp16, [1, 8, 256]> var_6235_1 = split(axis = var_6235_axis_0, split_sizes = var_6235_split_sizes_0, x = normed_257_cast_fp16)[name = string("op_6235")];
+            tensor<fp16, [1, 8, 256]> var_6237 = mul(x = var_6235_0, y = layers_c2_9_self_attn_q_norm_weight)[name = string("op_6237")];
+            tensor<int32, [4]> var_6242 = const()[name = string("op_6242"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_75 = reshape(shape = var_6242, x = var_6237)[name = string("q_75")];
+            tensor<fp16, [1, 8, 1, 256]> var_6244_cast_fp16 = mul(x = q_75, y = cos_s)[name = string("op_6244_cast_fp16")];
+            tensor<int32, [2]> var_6245_split_sizes_0 = const()[name = string("op_6245_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6245_axis_0 = const()[name = string("op_6245_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_6245_0, tensor<fp16, [1, 8, 1, 128]> var_6245_1 = split(axis = var_6245_axis_0, split_sizes = var_6245_split_sizes_0, x = q_75)[name = string("op_6245")];
+            fp16 const_108_promoted = const()[name = string("const_108_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_6247 = mul(x = var_6245_1, y = const_108_promoted)[name = string("op_6247")];
+            int32 var_6249 = const()[name = string("op_6249"), val = int32(-1)];
+            bool var_6250_interleave_0 = const()[name = string("op_6250_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_6250 = concat(axis = var_6249, interleave = var_6250_interleave_0, values = (var_6247, var_6245_0))[name = string("op_6250")];
+            tensor<fp16, [1, 8, 1, 256]> var_6251_cast_fp16 = mul(x = var_6250, y = sin_s)[name = string("op_6251_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_79_cast_fp16 = add(x = var_6244_cast_fp16, y = var_6251_cast_fp16)[name = string("q_79_cast_fp16")];
+            string var_6264_pad_type_0 = const()[name = string("op_6264_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6264_strides_0 = const()[name = string("op_6264_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6264_pad_0 = const()[name = string("op_6264_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6264_dilations_0 = const()[name = string("op_6264_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6264_groups_0 = const()[name = string("op_6264_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6264 = conv(dilations = var_6264_dilations_0, groups = var_6264_groups_0, pad = var_6264_pad_0, pad_type = var_6264_pad_type_0, strides = var_6264_strides_0, weight = layers_c2_9_self_attn_k_proj_weight_palettized, x = var_6185_cast_fp16)[name = string("op_6264")];
+            tensor<int32, [4]> var_6269 = const()[name = string("op_6269"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6270 = reshape(shape = var_6269, x = var_6264)[name = string("op_6270")];
+            tensor<int32, [4]> var_6275 = const()[name = string("op_6275"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_6292_pad_type_0 = const()[name = string("op_6292_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6292_strides_0 = const()[name = string("op_6292_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6292_pad_0 = const()[name = string("op_6292_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6292_dilations_0 = const()[name = string("op_6292_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6292_groups_0 = const()[name = string("op_6292_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6292 = conv(dilations = var_6292_dilations_0, groups = var_6292_groups_0, pad = var_6292_pad_0, pad_type = var_6292_pad_type_0, strides = var_6292_strides_0, weight = layers_c2_9_self_attn_v_proj_weight_palettized, x = var_6185_cast_fp16)[name = string("op_6292")];
+            tensor<int32, [4]> var_6297 = const()[name = string("op_6297"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6298 = reshape(shape = var_6297, x = var_6292)[name = string("op_6298")];
+            tensor<int32, [4]> var_6303 = const()[name = string("op_6303"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6313 = const()[name = string("op_6313"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_6276 = transpose(perm = var_6275, x = var_6270)[name = string("transpose_141")];
+            tensor<fp16, [1, 2, 256]> x_183 = reshape(shape = var_6313, x = var_6276)[name = string("x_183")];
+            int32 var_6319 = const()[name = string("op_6319"), val = int32(-1)];
+            fp16 const_109_promoted = const()[name = string("const_109_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_6321 = mul(x = x_183, y = const_109_promoted)[name = string("op_6321")];
+            bool input_273_interleave_0 = const()[name = string("input_273_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_273 = concat(axis = var_6319, interleave = input_273_interleave_0, values = (x_183, var_6321))[name = string("input_273")];
+            tensor<int32, [1]> normed_261_axes_0 = const()[name = string("normed_261_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6316_to_fp16 = const()[name = string("op_6316_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_261_cast_fp16 = layer_norm(axes = normed_261_axes_0, epsilon = var_6316_to_fp16, x = input_273)[name = string("normed_261_cast_fp16")];
+            tensor<int32, [2]> var_6326_split_sizes_0 = const()[name = string("op_6326_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6326_axis_0 = const()[name = string("op_6326_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_6326_0, tensor<fp16, [1, 2, 256]> var_6326_1 = split(axis = var_6326_axis_0, split_sizes = var_6326_split_sizes_0, x = normed_261_cast_fp16)[name = string("op_6326")];
+            tensor<fp16, [1, 2, 256]> var_6328 = mul(x = var_6326_0, y = layers_c2_9_self_attn_k_norm_weight)[name = string("op_6328")];
+            tensor<int32, [4]> var_6333 = const()[name = string("op_6333"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_77 = reshape(shape = var_6333, x = var_6328)[name = string("q_77")];
+            fp16 var_6335_promoted = const()[name = string("op_6335_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_6304 = transpose(perm = var_6303, x = var_6298)[name = string("transpose_140")];
+            tensor<fp16, [1, 2, 1, 256]> var_6336 = pow(x = var_6304, y = var_6335_promoted)[name = string("op_6336")];
+            tensor<int32, [1]> var_6341_axes_0 = const()[name = string("op_6341_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6341_keep_dims_0 = const()[name = string("op_6341_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_6341 = reduce_mean(axes = var_6341_axes_0, keep_dims = var_6341_keep_dims_0, x = var_6336)[name = string("op_6341")];
+            fp16 var_6343_to_fp16 = const()[name = string("op_6343_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_19_cast_fp16 = add(x = var_6341, y = var_6343_to_fp16)[name = string("mean_sq_19_cast_fp16")];
+            fp32 var_6345_epsilon_0 = const()[name = string("op_6345_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_6345_cast_fp16 = rsqrt(epsilon = var_6345_epsilon_0, x = mean_sq_19_cast_fp16)[name = string("op_6345_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_277_cast_fp16 = mul(x = var_6304, y = var_6345_cast_fp16)[name = string("input_277_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_6347_cast_fp16 = mul(x = q_77, y = cos_s)[name = string("op_6347_cast_fp16")];
+            tensor<int32, [2]> var_6348_split_sizes_0 = const()[name = string("op_6348_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6348_axis_0 = const()[name = string("op_6348_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_6348_0, tensor<fp16, [1, 2, 1, 128]> var_6348_1 = split(axis = var_6348_axis_0, split_sizes = var_6348_split_sizes_0, x = q_77)[name = string("op_6348")];
+            fp16 const_110_promoted = const()[name = string("const_110_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_6350 = mul(x = var_6348_1, y = const_110_promoted)[name = string("op_6350")];
+            int32 var_6352 = const()[name = string("op_6352"), val = int32(-1)];
+            bool var_6353_interleave_0 = const()[name = string("op_6353_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_6353 = concat(axis = var_6352, interleave = var_6353_interleave_0, values = (var_6350, var_6348_0))[name = string("op_6353")];
+            tensor<fp16, [1, 2, 1, 256]> var_6354_cast_fp16 = mul(x = var_6353, y = sin_s)[name = string("op_6354_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_275_cast_fp16 = add(x = var_6347_cast_fp16, y = var_6354_cast_fp16)[name = string("input_275_cast_fp16")];
+            tensor<int32, [8]> k_padded_17_pad_0 = const()[name = string("k_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_17_mode_0 = const()[name = string("k_padded_17_mode_0"), val = string("constant")];
+            fp16 const_111_to_fp16 = const()[name = string("const_111_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_17_cast_fp16 = pad(constant_val = const_111_to_fp16, mode = k_padded_17_mode_0, pad = k_padded_17_pad_0, x = input_275_cast_fp16)[name = string("k_padded_17_cast_fp16")];
+            tensor<int32, [8]> v_padded_17_pad_0 = const()[name = string("v_padded_17_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_17_mode_0 = const()[name = string("v_padded_17_mode_0"), val = string("constant")];
+            fp16 const_112_to_fp16 = const()[name = string("const_112_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_17_cast_fp16 = pad(constant_val = const_112_to_fp16, mode = v_padded_17_mode_0, pad = v_padded_17_pad_0, x = input_277_cast_fp16)[name = string("v_padded_17_cast_fp16")];
+            tensor<int32, [4]> var_6383_begin_0 = const()[name = string("op_6383_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6383_end_0 = const()[name = string("op_6383_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6383_end_mask_0 = const()[name = string("op_6383_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6383_cast_fp16 = slice_by_index(begin = var_6383_begin_0, end = var_6383_end_0, end_mask = var_6383_end_mask_0, x = K_sliding_slot_17_cast_fp16)[name = string("op_6383_cast_fp16")];
+            int32 var_6390 = const()[name = string("op_6390"), val = int32(2)];
+            bool K_sliding_out_17_interleave_0 = const()[name = string("K_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_17_cast_fp16 = concat(axis = var_6390, interleave = K_sliding_out_17_interleave_0, values = (var_6383_cast_fp16, k_padded_17_cast_fp16))[name = string("K_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> var_6406_begin_0 = const()[name = string("op_6406_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6406_end_0 = const()[name = string("op_6406_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6406_end_mask_0 = const()[name = string("op_6406_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6406_cast_fp16 = slice_by_index(begin = var_6406_begin_0, end = var_6406_end_0, end_mask = var_6406_end_mask_0, x = V_sliding_slot_17_cast_fp16)[name = string("op_6406_cast_fp16")];
+            int32 var_6413 = const()[name = string("op_6413"), val = int32(2)];
+            bool V_sliding_out_17_interleave_0 = const()[name = string("V_sliding_out_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_17_cast_fp16 = concat(axis = var_6413, interleave = V_sliding_out_17_interleave_0, values = (var_6406_cast_fp16, v_padded_17_cast_fp16))[name = string("V_sliding_out_17_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_19_begin_0 = const()[name = string("K_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_19_end_0 = const()[name = string("K_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_19_end_mask_0 = const()[name = string("K_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> K_for_attn_19_cast_fp16 = slice_by_index(begin = K_for_attn_19_begin_0, end = K_for_attn_19_end_0, end_mask = K_for_attn_19_end_mask_0, x = K_sliding_out_17_cast_fp16)[name = string("K_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_19_begin_0 = const()[name = string("V_for_attn_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_19_end_0 = const()[name = string("V_for_attn_19_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_19_end_mask_0 = const()[name = string("V_for_attn_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> V_for_attn_19_cast_fp16 = slice_by_index(begin = V_for_attn_19_begin_0, end = V_for_attn_19_end_0, end_mask = V_for_attn_19_end_mask_0, x = V_sliding_out_17_cast_fp16)[name = string("V_for_attn_19_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_18_reps_0 = const()[name = string("tile_18_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = K_for_attn_19_cast_fp16)[name = string("transpose_139")];
+            tensor<fp16, [8, 1, 512, 256]> tile_18_cast_fp16 = tile(reps = tile_18_reps_0, x = transpose_36_cast_fp16)[name = string("tile_18_cast_fp16")];
+            tensor<int32, [5]> concat_36 = const()[name = string("concat_36"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_36_cast_fp16 = reshape(shape = concat_36, x = tile_18_cast_fp16)[name = string("reshape_36_cast_fp16")];
+            tensor<int32, [5]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_37 = const()[name = string("concat_37"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_36_cast_fp16)[name = string("transpose_138")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_37_cast_fp16 = reshape(shape = concat_37, x = transpose_37_cast_fp16)[name = string("reshape_37_cast_fp16")];
+            tensor<int32, [4]> transpose_93_perm_0 = const()[name = string("transpose_93_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_19_reps_0 = const()[name = string("tile_19_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = V_for_attn_19_cast_fp16)[name = string("transpose_137")];
+            tensor<fp16, [8, 1, 512, 256]> tile_19_cast_fp16 = tile(reps = tile_19_reps_0, x = transpose_38_cast_fp16)[name = string("tile_19_cast_fp16")];
+            tensor<int32, [5]> concat_38 = const()[name = string("concat_38"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_38_cast_fp16 = reshape(shape = concat_38, x = tile_19_cast_fp16)[name = string("reshape_38_cast_fp16")];
+            tensor<int32, [5]> transpose_39_perm_0 = const()[name = string("transpose_39_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_39 = const()[name = string("concat_39"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_39_cast_fp16 = transpose(perm = transpose_39_perm_0, x = reshape_38_cast_fp16)[name = string("transpose_136")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_39_cast_fp16 = reshape(shape = concat_39, x = transpose_39_cast_fp16)[name = string("reshape_39_cast_fp16")];
+            tensor<int32, [4]> V_expanded_19_perm_0 = const()[name = string("V_expanded_19_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_37_transpose_x_0 = const()[name = string("attn_weights_37_transpose_x_0"), val = bool(false)];
+            bool attn_weights_37_transpose_y_0 = const()[name = string("attn_weights_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_93_cast_fp16 = transpose(perm = transpose_93_perm_0, x = reshape_37_cast_fp16)[name = string("transpose_135")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_37_cast_fp16 = matmul(transpose_x = attn_weights_37_transpose_x_0, transpose_y = attn_weights_37_transpose_y_0, x = q_79_cast_fp16, y = transpose_93_cast_fp16)[name = string("attn_weights_37_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_187_cast_fp16 = add(x = attn_weights_37_cast_fp16, y = causal_mask_sliding)[name = string("x_187_cast_fp16")];
+            tensor<int32, [1]> reduce_max_9_axes_0 = const()[name = string("reduce_max_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_9_keep_dims_0 = const()[name = string("reduce_max_9_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_9 = reduce_max(axes = reduce_max_9_axes_0, keep_dims = reduce_max_9_keep_dims_0, x = x_187_cast_fp16)[name = string("reduce_max_9")];
+            tensor<fp16, [1, 8, 1, 512]> var_6454 = sub(x = x_187_cast_fp16, y = reduce_max_9)[name = string("op_6454")];
+            tensor<fp16, [1, 8, 1, 512]> var_6460 = exp(x = var_6454)[name = string("op_6460")];
+            tensor<int32, [1]> var_6470_axes_0 = const()[name = string("op_6470_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6470_keep_dims_0 = const()[name = string("op_6470_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_6470 = reduce_sum(axes = var_6470_axes_0, keep_dims = var_6470_keep_dims_0, x = var_6460)[name = string("op_6470")];
+            tensor<fp16, [1, 8, 1, 512]> var_6476_cast_fp16 = real_div(x = var_6460, y = var_6470)[name = string("op_6476_cast_fp16")];
+            bool attn_output_55_transpose_x_0 = const()[name = string("attn_output_55_transpose_x_0"), val = bool(false)];
+            bool attn_output_55_transpose_y_0 = const()[name = string("attn_output_55_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_19_cast_fp16 = transpose(perm = V_expanded_19_perm_0, x = reshape_39_cast_fp16)[name = string("transpose_134")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_55_cast_fp16 = matmul(transpose_x = attn_output_55_transpose_x_0, transpose_y = attn_output_55_transpose_y_0, x = var_6476_cast_fp16, y = V_expanded_19_cast_fp16)[name = string("attn_output_55_cast_fp16")];
+            tensor<int32, [4]> var_6487 = const()[name = string("op_6487"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_6494 = const()[name = string("op_6494"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_6488_cast_fp16 = transpose(perm = var_6487, x = attn_output_55_cast_fp16)[name = string("transpose_133")];
+            tensor<fp16, [1, 1, 2048]> attn_output_57_cast_fp16 = reshape(shape = var_6494, x = var_6488_cast_fp16)[name = string("attn_output_57_cast_fp16")];
+            tensor<int32, [3]> var_6499 = const()[name = string("op_6499"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_6515_pad_type_0 = const()[name = string("op_6515_pad_type_0"), val = string("valid")];
+            int32 var_6515_groups_0 = const()[name = string("op_6515_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_6515_strides_0 = const()[name = string("op_6515_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_6515_pad_0 = const()[name = string("op_6515_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_6515_dilations_0 = const()[name = string("op_6515_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_9_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(943995520))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946617024))))[name = string("squeeze_9_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_6500_cast_fp16 = transpose(perm = var_6499, x = attn_output_57_cast_fp16)[name = string("transpose_132")];
+            tensor<fp16, [1, 2560, 1]> var_6515_cast_fp16 = conv(dilations = var_6515_dilations_0, groups = var_6515_groups_0, pad = var_6515_pad_0, pad_type = var_6515_pad_type_0, strides = var_6515_strides_0, weight = squeeze_9_cast_fp16_to_fp32_to_fp16_palettized, x = var_6500_cast_fp16)[name = string("op_6515_cast_fp16")];
+            tensor<int32, [3]> var_6519 = const()[name = string("op_6519"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6525 = const()[name = string("op_6525"), val = int32(-1)];
+            fp16 const_113_promoted_to_fp16 = const()[name = string("const_113_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_191_cast_fp16 = transpose(perm = var_6519, x = var_6515_cast_fp16)[name = string("transpose_131")];
+            tensor<fp16, [1, 1, 2560]> var_6527_cast_fp16 = mul(x = x_191_cast_fp16, y = const_113_promoted_to_fp16)[name = string("op_6527_cast_fp16")];
+            bool input_281_interleave_0 = const()[name = string("input_281_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_281_cast_fp16 = concat(axis = var_6525, interleave = input_281_interleave_0, values = (x_191_cast_fp16, var_6527_cast_fp16))[name = string("input_281_cast_fp16")];
+            tensor<int32, [1]> normed_265_axes_0 = const()[name = string("normed_265_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6522_to_fp16 = const()[name = string("op_6522_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_265_cast_fp16 = layer_norm(axes = normed_265_axes_0, epsilon = var_6522_to_fp16, x = input_281_cast_fp16)[name = string("normed_265_cast_fp16")];
+            tensor<int32, [2]> var_6532_split_sizes_0 = const()[name = string("op_6532_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6532_axis_0 = const()[name = string("op_6532_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6532_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6532_cast_fp16_1 = split(axis = var_6532_axis_0, split_sizes = var_6532_split_sizes_0, x = normed_265_cast_fp16)[name = string("op_6532_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_9_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_9_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946619648)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_59_cast_fp16 = mul(x = var_6532_cast_fp16_0, y = layers_c2_9_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_193_cast_fp16 = add(x = x_179_cast_fp16, y = attn_output_59_cast_fp16)[name = string("x_193_cast_fp16")];
+            int32 var_6541 = const()[name = string("op_6541"), val = int32(-1)];
+            fp16 const_114_promoted_to_fp16 = const()[name = string("const_114_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6543_cast_fp16 = mul(x = x_193_cast_fp16, y = const_114_promoted_to_fp16)[name = string("op_6543_cast_fp16")];
+            bool input_283_interleave_0 = const()[name = string("input_283_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_283_cast_fp16 = concat(axis = var_6541, interleave = input_283_interleave_0, values = (x_193_cast_fp16, var_6543_cast_fp16))[name = string("input_283_cast_fp16")];
+            tensor<int32, [1]> normed_269_axes_0 = const()[name = string("normed_269_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6538_to_fp16 = const()[name = string("op_6538_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_269_cast_fp16 = layer_norm(axes = normed_269_axes_0, epsilon = var_6538_to_fp16, x = input_283_cast_fp16)[name = string("normed_269_cast_fp16")];
+            tensor<int32, [2]> var_6548_split_sizes_0 = const()[name = string("op_6548_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6548_axis_0 = const()[name = string("op_6548_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6548_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6548_cast_fp16_1 = split(axis = var_6548_axis_0, split_sizes = var_6548_split_sizes_0, x = normed_269_cast_fp16)[name = string("op_6548_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_9_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_9_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946624832)))];
+            tensor<fp16, [1, 1, 2560]> h_57_cast_fp16 = mul(x = var_6548_cast_fp16_0, y = layers_c2_9_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_57_cast_fp16")];
+            tensor<int32, [3]> var_6559 = const()[name = string("op_6559"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_285_axes_0 = const()[name = string("input_285_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6560 = transpose(perm = var_6559, x = h_57_cast_fp16)[name = string("transpose_130")];
+            tensor<fp16, [1, 2560, 1, 1]> input_285 = expand_dims(axes = input_285_axes_0, x = var_6560)[name = string("input_285")];
+            string gate_37_pad_type_0 = const()[name = string("gate_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_37_strides_0 = const()[name = string("gate_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_37_pad_0 = const()[name = string("gate_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_37_dilations_0 = const()[name = string("gate_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_37_groups_0 = const()[name = string("gate_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_37 = conv(dilations = gate_37_dilations_0, groups = gate_37_groups_0, pad = gate_37_pad_0, pad_type = gate_37_pad_type_0, strides = gate_37_strides_0, weight = layers_c2_9_mlp_gate_proj_weight_palettized, x = input_285)[name = string("gate_37")];
+            string up_19_pad_type_0 = const()[name = string("up_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_19_strides_0 = const()[name = string("up_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_19_pad_0 = const()[name = string("up_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_19_dilations_0 = const()[name = string("up_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_19_groups_0 = const()[name = string("up_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_19 = conv(dilations = up_19_dilations_0, groups = up_19_groups_0, pad = up_19_pad_0, pad_type = up_19_pad_type_0, strides = up_19_strides_0, weight = layers_c2_9_mlp_up_proj_weight_palettized, x = input_285)[name = string("up_19")];
+            string gate_39_mode_0 = const()[name = string("gate_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_39 = gelu(mode = gate_39_mode_0, x = gate_37)[name = string("gate_39")];
+            tensor<fp16, [1, 10240, 1, 1]> input_287 = mul(x = gate_39, y = up_19)[name = string("input_287")];
+            string mlp_out_19_pad_type_0 = const()[name = string("mlp_out_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_19_strides_0 = const()[name = string("mlp_out_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_19_pad_0 = const()[name = string("mlp_out_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_19_dilations_0 = const()[name = string("mlp_out_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_19_groups_0 = const()[name = string("mlp_out_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_19 = conv(dilations = mlp_out_19_dilations_0, groups = mlp_out_19_groups_0, pad = mlp_out_19_pad_0, pad_type = mlp_out_19_pad_type_0, strides = mlp_out_19_strides_0, weight = layers_c2_9_mlp_down_proj_weight_palettized, x = input_287)[name = string("mlp_out_19")];
+            tensor<int32, [1]> var_6600_axes_0 = const()[name = string("op_6600_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6600 = squeeze(axes = var_6600_axes_0, x = mlp_out_19)[name = string("op_6600")];
+            tensor<int32, [3]> var_6604 = const()[name = string("op_6604"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6610 = const()[name = string("op_6610"), val = int32(-1)];
+            fp16 const_115_promoted = const()[name = string("const_115_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_195 = transpose(perm = var_6604, x = var_6600)[name = string("transpose_129")];
+            tensor<fp16, [1, 1, 2560]> var_6612 = mul(x = x_195, y = const_115_promoted)[name = string("op_6612")];
+            bool input_289_interleave_0 = const()[name = string("input_289_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_289 = concat(axis = var_6610, interleave = input_289_interleave_0, values = (x_195, var_6612))[name = string("input_289")];
+            tensor<int32, [1]> normed_273_axes_0 = const()[name = string("normed_273_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6607_to_fp16 = const()[name = string("op_6607_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_273_cast_fp16 = layer_norm(axes = normed_273_axes_0, epsilon = var_6607_to_fp16, x = input_289)[name = string("normed_273_cast_fp16")];
+            tensor<int32, [2]> var_6617_split_sizes_0 = const()[name = string("op_6617_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6617_axis_0 = const()[name = string("op_6617_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6617_0, tensor<fp16, [1, 1, 2560]> var_6617_1 = split(axis = var_6617_axis_0, split_sizes = var_6617_split_sizes_0, x = normed_273_cast_fp16)[name = string("op_6617")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_93 = mul(x = var_6617_0, y = layers_c2_9_post_feedforward_layernorm_weight)[name = string("hidden_states_93")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_95_cast_fp16 = add(x = x_193_cast_fp16, y = hidden_states_93)[name = string("hidden_states_95_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_19_begin_0 = const()[name = string("per_layer_slice_19_begin_0"), val = tensor<int32, [3]>([0, 0, 5376])];
+            tensor<int32, [3]> per_layer_slice_19_end_0 = const()[name = string("per_layer_slice_19_end_0"), val = tensor<int32, [3]>([1, 1, 5632])];
+            tensor<bool, [3]> per_layer_slice_19_end_mask_0 = const()[name = string("per_layer_slice_19_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_19_cast_fp16 = slice_by_index(begin = per_layer_slice_19_begin_0, end = per_layer_slice_19_end_0, end_mask = per_layer_slice_19_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_19_cast_fp16")];
+            tensor<int32, [3]> var_6645 = const()[name = string("op_6645"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_291_axes_0 = const()[name = string("input_291_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6646 = transpose(perm = var_6645, x = hidden_states_95_cast_fp16)[name = string("transpose_128")];
+            tensor<fp16, [1, 2560, 1, 1]> input_291 = expand_dims(axes = input_291_axes_0, x = var_6646)[name = string("input_291")];
+            string gated_55_pad_type_0 = const()[name = string("gated_55_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_55_strides_0 = const()[name = string("gated_55_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_55_pad_0 = const()[name = string("gated_55_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_55_dilations_0 = const()[name = string("gated_55_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_55_groups_0 = const()[name = string("gated_55_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_55 = conv(dilations = gated_55_dilations_0, groups = gated_55_groups_0, pad = gated_55_pad_0, pad_type = gated_55_pad_type_0, strides = gated_55_strides_0, weight = layers_c2_9_per_layer_input_gate_weight_palettized, x = input_291)[name = string("gated_55")];
+            string gated_57_mode_0 = const()[name = string("gated_57_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_57 = gelu(mode = gated_57_mode_0, x = gated_55)[name = string("gated_57")];
+            tensor<int32, [3]> var_6665 = const()[name = string("op_6665"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_19_axes_0 = const()[name = string("per_layer_slice_conv_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_6666_cast_fp16 = transpose(perm = var_6665, x = per_layer_slice_19_cast_fp16)[name = string("transpose_127")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_19_cast_fp16 = expand_dims(axes = per_layer_slice_conv_19_axes_0, x = var_6666_cast_fp16)[name = string("per_layer_slice_conv_19_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_293_cast_fp16 = mul(x = gated_57, y = per_layer_slice_conv_19_cast_fp16)[name = string("input_293_cast_fp16")];
+            string gated_59_pad_type_0 = const()[name = string("gated_59_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_59_strides_0 = const()[name = string("gated_59_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_59_pad_0 = const()[name = string("gated_59_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_59_dilations_0 = const()[name = string("gated_59_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_59_groups_0 = const()[name = string("gated_59_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_9_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946630016))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946957760))))[name = string("layers_c2_9_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_59_cast_fp16 = conv(dilations = gated_59_dilations_0, groups = gated_59_groups_0, pad = gated_59_pad_0, pad_type = gated_59_pad_type_0, strides = gated_59_strides_0, weight = layers_c2_9_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_293_cast_fp16)[name = string("gated_59_cast_fp16")];
+            tensor<int32, [1]> var_6682_axes_0 = const()[name = string("op_6682_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6682_cast_fp16 = squeeze(axes = var_6682_axes_0, x = gated_59_cast_fp16)[name = string("op_6682_cast_fp16")];
+            tensor<int32, [3]> var_6686 = const()[name = string("op_6686"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_6692 = const()[name = string("op_6692"), val = int32(-1)];
+            fp16 const_116_promoted_to_fp16 = const()[name = string("const_116_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_197_cast_fp16 = transpose(perm = var_6686, x = var_6682_cast_fp16)[name = string("transpose_126")];
+            tensor<fp16, [1, 1, 2560]> var_6694_cast_fp16 = mul(x = x_197_cast_fp16, y = const_116_promoted_to_fp16)[name = string("op_6694_cast_fp16")];
+            bool input_295_interleave_0 = const()[name = string("input_295_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_295_cast_fp16 = concat(axis = var_6692, interleave = input_295_interleave_0, values = (x_197_cast_fp16, var_6694_cast_fp16))[name = string("input_295_cast_fp16")];
+            tensor<int32, [1]> normed_277_axes_0 = const()[name = string("normed_277_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6689_to_fp16 = const()[name = string("op_6689_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_277_cast_fp16 = layer_norm(axes = normed_277_axes_0, epsilon = var_6689_to_fp16, x = input_295_cast_fp16)[name = string("normed_277_cast_fp16")];
+            tensor<int32, [2]> var_6699_split_sizes_0 = const()[name = string("op_6699_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6699_axis_0 = const()[name = string("op_6699_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6699_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6699_cast_fp16_1 = split(axis = var_6699_axis_0, split_sizes = var_6699_split_sizes_0, x = normed_277_cast_fp16)[name = string("op_6699_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_9_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_9_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946960384)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_99_cast_fp16 = mul(x = var_6699_cast_fp16_0, y = layers_c2_9_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_99_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_101_cast_fp16 = add(x = hidden_states_95_cast_fp16, y = hidden_states_99_cast_fp16)[name = string("hidden_states_101_cast_fp16")];
+            tensor<fp16, [1]> const_117_promoted_to_fp16 = const()[name = string("const_117_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.d8p-2])];
+            tensor<fp16, [1, 1, 2560]> x_199_cast_fp16 = mul(x = hidden_states_101_cast_fp16, y = const_117_promoted_to_fp16)[name = string("x_199_cast_fp16")];
+            tensor<int32, [1]> var_6711_axes_0 = const()[name = string("op_6711_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6711_cast_fp16 = squeeze(axes = var_6711_axes_0, x = K_sliding_out_17_cast_fp16)[name = string("op_6711_cast_fp16")];
+            tensor<int32, [1]> var_6713_axes_0 = const()[name = string("op_6713_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_6713_cast_fp16 = squeeze(axes = var_6713_axes_0, x = V_sliding_out_17_cast_fp16)[name = string("op_6713_cast_fp16")];
+            tensor<int32, [4]> var_6716_begin_0 = const()[name = string("op_6716_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6716_end_0 = const()[name = string("op_6716_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6716_end_mask_0 = const()[name = string("op_6716_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6716_squeeze_mask_0 = const()[name = string("op_6716_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6716_cast_fp16 = slice_by_index(begin = var_6716_begin_0, end = var_6716_end_0, end_mask = var_6716_end_mask_0, squeeze_mask = var_6716_squeeze_mask_0, x = K_sliding_in)[name = string("op_6716_cast_fp16")];
+            tensor<int32, [1]> K_sliding_slot_axes_0 = const()[name = string("K_sliding_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_slot_cast_fp16 = expand_dims(axes = K_sliding_slot_axes_0, x = var_6716_cast_fp16)[name = string("K_sliding_slot_cast_fp16")];
+            tensor<int32, [4]> var_6721_begin_0 = const()[name = string("op_6721_begin_0"), val = tensor<int32, [4]>([9, 0, 0, 0])];
+            tensor<int32, [4]> var_6721_end_0 = const()[name = string("op_6721_end_0"), val = tensor<int32, [4]>([10, 2, 512, 512])];
+            tensor<bool, [4]> var_6721_end_mask_0 = const()[name = string("op_6721_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_6721_squeeze_mask_0 = const()[name = string("op_6721_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 512, 512]> var_6721_cast_fp16 = slice_by_index(begin = var_6721_begin_0, end = var_6721_end_0, end_mask = var_6721_end_mask_0, squeeze_mask = var_6721_squeeze_mask_0, x = V_sliding_in)[name = string("op_6721_cast_fp16")];
+            tensor<int32, [1]> V_sliding_slot_axes_0 = const()[name = string("V_sliding_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_slot_cast_fp16 = expand_dims(axes = V_sliding_slot_axes_0, x = var_6721_cast_fp16)[name = string("V_sliding_slot_cast_fp16")];
+            int32 var_6728 = const()[name = string("op_6728"), val = int32(-1)];
+            fp16 const_118_promoted_to_fp16 = const()[name = string("const_118_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_6730_cast_fp16 = mul(x = x_199_cast_fp16, y = const_118_promoted_to_fp16)[name = string("op_6730_cast_fp16")];
+            bool input_297_interleave_0 = const()[name = string("input_297_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_297_cast_fp16 = concat(axis = var_6728, interleave = input_297_interleave_0, values = (x_199_cast_fp16, var_6730_cast_fp16))[name = string("input_297_cast_fp16")];
+            tensor<int32, [1]> normed_281_axes_0 = const()[name = string("normed_281_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6725_to_fp16 = const()[name = string("op_6725_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_281_cast_fp16 = layer_norm(axes = normed_281_axes_0, epsilon = var_6725_to_fp16, x = input_297_cast_fp16)[name = string("normed_281_cast_fp16")];
+            tensor<int32, [2]> var_6735_split_sizes_0 = const()[name = string("op_6735_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_6735_axis_0 = const()[name = string("op_6735_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_6735_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_6735_cast_fp16_1 = split(axis = var_6735_axis_0, split_sizes = var_6735_split_sizes_0, x = normed_281_cast_fp16)[name = string("op_6735_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_10_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_10_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946965568)))];
+            tensor<fp16, [1, 1, 2560]> h_61_cast_fp16 = mul(x = var_6735_cast_fp16_0, y = layers_c2_10_input_layernorm_weight_promoted_to_fp16)[name = string("h_61_cast_fp16")];
+            tensor<int32, [3]> var_6741 = const()[name = string("op_6741"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_6744_axes_0 = const()[name = string("op_6744_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_6742_cast_fp16 = transpose(perm = var_6741, x = h_61_cast_fp16)[name = string("transpose_125")];
+            tensor<fp16, [1, 2560, 1, 1]> var_6744_cast_fp16 = expand_dims(axes = var_6744_axes_0, x = var_6742_cast_fp16)[name = string("op_6744_cast_fp16")];
+            string var_6760_pad_type_0 = const()[name = string("op_6760_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6760_strides_0 = const()[name = string("op_6760_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6760_pad_0 = const()[name = string("op_6760_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6760_dilations_0 = const()[name = string("op_6760_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6760_groups_0 = const()[name = string("op_6760_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_6760 = conv(dilations = var_6760_dilations_0, groups = var_6760_groups_0, pad = var_6760_pad_0, pad_type = var_6760_pad_type_0, strides = var_6760_strides_0, weight = layers_c2_10_self_attn_q_proj_weight_palettized, x = var_6744_cast_fp16)[name = string("op_6760")];
+            tensor<int32, [4]> var_6765 = const()[name = string("op_6765"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_6766 = reshape(shape = var_6765, x = var_6760)[name = string("op_6766")];
+            tensor<int32, [4]> var_6771 = const()[name = string("op_6771"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6781 = const()[name = string("op_6781"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_6772 = transpose(perm = var_6771, x = var_6766)[name = string("transpose_124")];
+            tensor<fp16, [1, 8, 256]> x_201 = reshape(shape = var_6781, x = var_6772)[name = string("x_201")];
+            int32 var_6787 = const()[name = string("op_6787"), val = int32(-1)];
+            fp16 const_119_promoted = const()[name = string("const_119_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_6789 = mul(x = x_201, y = const_119_promoted)[name = string("op_6789")];
+            bool input_301_interleave_0 = const()[name = string("input_301_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_301 = concat(axis = var_6787, interleave = input_301_interleave_0, values = (x_201, var_6789))[name = string("input_301")];
+            tensor<int32, [1]> normed_285_axes_0 = const()[name = string("normed_285_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6784_to_fp16 = const()[name = string("op_6784_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_285_cast_fp16 = layer_norm(axes = normed_285_axes_0, epsilon = var_6784_to_fp16, x = input_301)[name = string("normed_285_cast_fp16")];
+            tensor<int32, [2]> var_6794_split_sizes_0 = const()[name = string("op_6794_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6794_axis_0 = const()[name = string("op_6794_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_6794_0, tensor<fp16, [1, 8, 256]> var_6794_1 = split(axis = var_6794_axis_0, split_sizes = var_6794_split_sizes_0, x = normed_285_cast_fp16)[name = string("op_6794")];
+            tensor<fp16, [1, 8, 256]> var_6796 = mul(x = var_6794_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_6796")];
+            tensor<int32, [4]> var_6801 = const()[name = string("op_6801"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_83 = reshape(shape = var_6801, x = var_6796)[name = string("q_83")];
+            tensor<fp16, [1, 8, 1, 256]> var_6803_cast_fp16 = mul(x = q_83, y = cos_s)[name = string("op_6803_cast_fp16")];
+            tensor<int32, [2]> var_6804_split_sizes_0 = const()[name = string("op_6804_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6804_axis_0 = const()[name = string("op_6804_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_6804_0, tensor<fp16, [1, 8, 1, 128]> var_6804_1 = split(axis = var_6804_axis_0, split_sizes = var_6804_split_sizes_0, x = q_83)[name = string("op_6804")];
+            fp16 const_120_promoted = const()[name = string("const_120_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_6806 = mul(x = var_6804_1, y = const_120_promoted)[name = string("op_6806")];
+            int32 var_6808 = const()[name = string("op_6808"), val = int32(-1)];
+            bool var_6809_interleave_0 = const()[name = string("op_6809_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_6809 = concat(axis = var_6808, interleave = var_6809_interleave_0, values = (var_6806, var_6804_0))[name = string("op_6809")];
+            tensor<fp16, [1, 8, 1, 256]> var_6810_cast_fp16 = mul(x = var_6809, y = sin_s)[name = string("op_6810_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_87_cast_fp16 = add(x = var_6803_cast_fp16, y = var_6810_cast_fp16)[name = string("q_87_cast_fp16")];
+            string var_6823_pad_type_0 = const()[name = string("op_6823_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6823_strides_0 = const()[name = string("op_6823_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6823_pad_0 = const()[name = string("op_6823_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6823_dilations_0 = const()[name = string("op_6823_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6823_groups_0 = const()[name = string("op_6823_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6823 = conv(dilations = var_6823_dilations_0, groups = var_6823_groups_0, pad = var_6823_pad_0, pad_type = var_6823_pad_type_0, strides = var_6823_strides_0, weight = layers_c2_10_self_attn_k_proj_weight_palettized, x = var_6744_cast_fp16)[name = string("op_6823")];
+            tensor<int32, [4]> var_6828 = const()[name = string("op_6828"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6829 = reshape(shape = var_6828, x = var_6823)[name = string("op_6829")];
+            tensor<int32, [4]> var_6834 = const()[name = string("op_6834"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_6851_pad_type_0 = const()[name = string("op_6851_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_6851_strides_0 = const()[name = string("op_6851_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_6851_pad_0 = const()[name = string("op_6851_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_6851_dilations_0 = const()[name = string("op_6851_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_6851_groups_0 = const()[name = string("op_6851_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 512, 1, 1]> var_6851 = conv(dilations = var_6851_dilations_0, groups = var_6851_groups_0, pad = var_6851_pad_0, pad_type = var_6851_pad_type_0, strides = var_6851_strides_0, weight = layers_c2_10_self_attn_v_proj_weight_palettized, x = var_6744_cast_fp16)[name = string("op_6851")];
+            tensor<int32, [4]> var_6856 = const()[name = string("op_6856"), val = tensor<int32, [4]>([1, 2, 256, 1])];
+            tensor<fp16, [1, 2, 256, 1]> var_6857 = reshape(shape = var_6856, x = var_6851)[name = string("op_6857")];
+            tensor<int32, [4]> var_6862 = const()[name = string("op_6862"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_6872 = const()[name = string("op_6872"), val = tensor<int32, [3]>([1, 2, 256])];
+            tensor<fp16, [1, 2, 1, 256]> var_6835 = transpose(perm = var_6834, x = var_6829)[name = string("transpose_123")];
+            tensor<fp16, [1, 2, 256]> x_203 = reshape(shape = var_6872, x = var_6835)[name = string("x_203")];
+            int32 var_6878 = const()[name = string("op_6878"), val = int32(-1)];
+            fp16 const_121_promoted = const()[name = string("const_121_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 256]> var_6880 = mul(x = x_203, y = const_121_promoted)[name = string("op_6880")];
+            bool input_303_interleave_0 = const()[name = string("input_303_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512]> input_303 = concat(axis = var_6878, interleave = input_303_interleave_0, values = (x_203, var_6880))[name = string("input_303")];
+            tensor<int32, [1]> normed_289_axes_0 = const()[name = string("normed_289_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_6875_to_fp16 = const()[name = string("op_6875_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 512]> normed_289_cast_fp16 = layer_norm(axes = normed_289_axes_0, epsilon = var_6875_to_fp16, x = input_303)[name = string("normed_289_cast_fp16")];
+            tensor<int32, [2]> var_6885_split_sizes_0 = const()[name = string("op_6885_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_6885_axis_0 = const()[name = string("op_6885_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 256]> var_6885_0, tensor<fp16, [1, 2, 256]> var_6885_1 = split(axis = var_6885_axis_0, split_sizes = var_6885_split_sizes_0, x = normed_289_cast_fp16)[name = string("op_6885")];
+            tensor<fp16, [1, 2, 256]> var_6887 = mul(x = var_6885_0, y = layers_c2_4_self_attn_k_norm_weight)[name = string("op_6887")];
+            tensor<int32, [4]> var_6892 = const()[name = string("op_6892"), val = tensor<int32, [4]>([1, 2, 1, 256])];
+            tensor<fp16, [1, 2, 1, 256]> q_85 = reshape(shape = var_6892, x = var_6887)[name = string("q_85")];
+            fp16 var_6894_promoted = const()[name = string("op_6894_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 256]> var_6863 = transpose(perm = var_6862, x = var_6857)[name = string("transpose_122")];
+            tensor<fp16, [1, 2, 1, 256]> var_6895 = pow(x = var_6863, y = var_6894_promoted)[name = string("op_6895")];
+            tensor<int32, [1]> var_6900_axes_0 = const()[name = string("op_6900_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_6900_keep_dims_0 = const()[name = string("op_6900_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_6900 = reduce_mean(axes = var_6900_axes_0, keep_dims = var_6900_keep_dims_0, x = var_6895)[name = string("op_6900")];
+            fp16 var_6902_to_fp16 = const()[name = string("op_6902_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_21_cast_fp16 = add(x = var_6900, y = var_6902_to_fp16)[name = string("mean_sq_21_cast_fp16")];
+            fp32 var_6904_epsilon_0 = const()[name = string("op_6904_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_6904_cast_fp16 = rsqrt(epsilon = var_6904_epsilon_0, x = mean_sq_21_cast_fp16)[name = string("op_6904_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_307_cast_fp16 = mul(x = var_6863, y = var_6904_cast_fp16)[name = string("input_307_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> var_6906_cast_fp16 = mul(x = q_85, y = cos_s)[name = string("op_6906_cast_fp16")];
+            tensor<int32, [2]> var_6907_split_sizes_0 = const()[name = string("op_6907_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_6907_axis_0 = const()[name = string("op_6907_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 128]> var_6907_0, tensor<fp16, [1, 2, 1, 128]> var_6907_1 = split(axis = var_6907_axis_0, split_sizes = var_6907_split_sizes_0, x = q_85)[name = string("op_6907")];
+            fp16 const_122_promoted = const()[name = string("const_122_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 128]> var_6909 = mul(x = var_6907_1, y = const_122_promoted)[name = string("op_6909")];
+            int32 var_6911 = const()[name = string("op_6911"), val = int32(-1)];
+            bool var_6912_interleave_0 = const()[name = string("op_6912_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 256]> var_6912 = concat(axis = var_6911, interleave = var_6912_interleave_0, values = (var_6909, var_6907_0))[name = string("op_6912")];
+            tensor<fp16, [1, 2, 1, 256]> var_6913_cast_fp16 = mul(x = var_6912, y = sin_s)[name = string("op_6913_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 256]> input_305_cast_fp16 = add(x = var_6906_cast_fp16, y = var_6913_cast_fp16)[name = string("input_305_cast_fp16")];
+            tensor<int32, [8]> k_padded_pad_0 = const()[name = string("k_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string k_padded_mode_0 = const()[name = string("k_padded_mode_0"), val = string("constant")];
+            fp16 const_123_to_fp16 = const()[name = string("const_123_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> k_padded_cast_fp16 = pad(constant_val = const_123_to_fp16, mode = k_padded_mode_0, pad = k_padded_pad_0, x = input_305_cast_fp16)[name = string("k_padded_cast_fp16")];
+            tensor<int32, [8]> v_padded_pad_0 = const()[name = string("v_padded_pad_0"), val = tensor<int32, [8]>([0, 0, 0, 0, 0, 0, 0, 256])];
+            string v_padded_mode_0 = const()[name = string("v_padded_mode_0"), val = string("constant")];
+            fp16 const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2, 1, 512]> v_padded_cast_fp16 = pad(constant_val = const_124_to_fp16, mode = v_padded_mode_0, pad = v_padded_pad_0, x = input_307_cast_fp16)[name = string("v_padded_cast_fp16")];
+            tensor<int32, [4]> var_6942_begin_0 = const()[name = string("op_6942_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6942_end_0 = const()[name = string("op_6942_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6942_end_mask_0 = const()[name = string("op_6942_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6942_cast_fp16 = slice_by_index(begin = var_6942_begin_0, end = var_6942_end_0, end_mask = var_6942_end_mask_0, x = K_sliding_slot_cast_fp16)[name = string("op_6942_cast_fp16")];
+            int32 var_6949 = const()[name = string("op_6949"), val = int32(2)];
+            bool K_sliding_out_interleave_0 = const()[name = string("K_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> K_sliding_out_cast_fp16 = concat(axis = var_6949, interleave = K_sliding_out_interleave_0, values = (var_6942_cast_fp16, k_padded_cast_fp16))[name = string("K_sliding_out_cast_fp16")];
+            tensor<int32, [4]> var_6965_begin_0 = const()[name = string("op_6965_begin_0"), val = tensor<int32, [4]>([0, 0, 1, 0])];
+            tensor<int32, [4]> var_6965_end_0 = const()[name = string("op_6965_end_0"), val = tensor<int32, [4]>([1, 2, 512, 512])];
+            tensor<bool, [4]> var_6965_end_mask_0 = const()[name = string("op_6965_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2, 511, 512]> var_6965_cast_fp16 = slice_by_index(begin = var_6965_begin_0, end = var_6965_end_0, end_mask = var_6965_end_mask_0, x = V_sliding_slot_cast_fp16)[name = string("op_6965_cast_fp16")];
+            int32 var_6972 = const()[name = string("op_6972"), val = int32(2)];
+            bool V_sliding_out_interleave_0 = const()[name = string("V_sliding_out_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 512, 512]> V_sliding_out_cast_fp16 = concat(axis = var_6972, interleave = V_sliding_out_interleave_0, values = (var_6965_cast_fp16, v_padded_cast_fp16))[name = string("V_sliding_out_cast_fp16")];
+            tensor<int32, [4]> K_for_attn_21_begin_0 = const()[name = string("K_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> K_for_attn_21_end_0 = const()[name = string("K_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> K_for_attn_21_end_mask_0 = const()[name = string("K_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> kv13_k = slice_by_index(begin = K_for_attn_21_begin_0, end = K_for_attn_21_end_0, end_mask = K_for_attn_21_end_mask_0, x = K_sliding_out_cast_fp16)[name = string("K_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> V_for_attn_21_begin_0 = const()[name = string("V_for_attn_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> V_for_attn_21_end_0 = const()[name = string("V_for_attn_21_end_0"), val = tensor<int32, [4]>([1, 2, 512, 256])];
+            tensor<bool, [4]> V_for_attn_21_end_mask_0 = const()[name = string("V_for_attn_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2, 512, 256]> kv13_v = slice_by_index(begin = V_for_attn_21_begin_0, end = V_for_attn_21_end_0, end_mask = V_for_attn_21_end_mask_0, x = V_sliding_out_cast_fp16)[name = string("V_for_attn_21_cast_fp16")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_20_reps_0 = const()[name = string("tile_20_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_40_cast_fp16 = transpose(perm = transpose_40_perm_0, x = kv13_k)[name = string("transpose_121")];
+            tensor<fp16, [8, 1, 512, 256]> tile_20_cast_fp16 = tile(reps = tile_20_reps_0, x = transpose_40_cast_fp16)[name = string("tile_20_cast_fp16")];
+            tensor<int32, [5]> concat_40 = const()[name = string("concat_40"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_40_cast_fp16 = reshape(shape = concat_40, x = tile_20_cast_fp16)[name = string("reshape_40_cast_fp16")];
+            tensor<int32, [5]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_41 = const()[name = string("concat_41"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_40_cast_fp16)[name = string("transpose_120")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_41_cast_fp16 = reshape(shape = concat_41, x = transpose_41_cast_fp16)[name = string("reshape_41_cast_fp16")];
+            tensor<int32, [4]> transpose_94_perm_0 = const()[name = string("transpose_94_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_21_reps_0 = const()[name = string("tile_21_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_42_cast_fp16 = transpose(perm = transpose_42_perm_0, x = kv13_v)[name = string("transpose_119")];
+            tensor<fp16, [8, 1, 512, 256]> tile_21_cast_fp16 = tile(reps = tile_21_reps_0, x = transpose_42_cast_fp16)[name = string("tile_21_cast_fp16")];
+            tensor<int32, [5]> concat_42 = const()[name = string("concat_42"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_42_cast_fp16 = reshape(shape = concat_42, x = tile_21_cast_fp16)[name = string("reshape_42_cast_fp16")];
+            tensor<int32, [5]> transpose_43_perm_0 = const()[name = string("transpose_43_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_43 = const()[name = string("concat_43"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_43_cast_fp16 = transpose(perm = transpose_43_perm_0, x = reshape_42_cast_fp16)[name = string("transpose_118")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_43_cast_fp16 = reshape(shape = concat_43, x = transpose_43_cast_fp16)[name = string("reshape_43_cast_fp16")];
+            tensor<int32, [4]> V_expanded_21_perm_0 = const()[name = string("V_expanded_21_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_41_transpose_x_0 = const()[name = string("attn_weights_41_transpose_x_0"), val = bool(false)];
+            bool attn_weights_41_transpose_y_0 = const()[name = string("attn_weights_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_94_cast_fp16 = transpose(perm = transpose_94_perm_0, x = reshape_41_cast_fp16)[name = string("transpose_117")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_41_cast_fp16 = matmul(transpose_x = attn_weights_41_transpose_x_0, transpose_y = attn_weights_41_transpose_y_0, x = q_87_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_41_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_207_cast_fp16 = add(x = attn_weights_41_cast_fp16, y = causal_mask_sliding)[name = string("x_207_cast_fp16")];
+            tensor<int32, [1]> reduce_max_10_axes_0 = const()[name = string("reduce_max_10_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_10_keep_dims_0 = const()[name = string("reduce_max_10_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_10 = reduce_max(axes = reduce_max_10_axes_0, keep_dims = reduce_max_10_keep_dims_0, x = x_207_cast_fp16)[name = string("reduce_max_10")];
+            tensor<fp16, [1, 8, 1, 512]> var_7023 = sub(x = x_207_cast_fp16, y = reduce_max_10)[name = string("op_7023")];
+            tensor<fp16, [1, 8, 1, 512]> var_7029 = exp(x = var_7023)[name = string("op_7029")];
+            tensor<int32, [1]> var_7039_axes_0 = const()[name = string("op_7039_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7039_keep_dims_0 = const()[name = string("op_7039_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_7039 = reduce_sum(axes = var_7039_axes_0, keep_dims = var_7039_keep_dims_0, x = var_7029)[name = string("op_7039")];
+            tensor<fp16, [1, 8, 1, 512]> var_7045_cast_fp16 = real_div(x = var_7029, y = var_7039)[name = string("op_7045_cast_fp16")];
+            bool attn_output_61_transpose_x_0 = const()[name = string("attn_output_61_transpose_x_0"), val = bool(false)];
+            bool attn_output_61_transpose_y_0 = const()[name = string("attn_output_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_21_cast_fp16 = transpose(perm = V_expanded_21_perm_0, x = reshape_43_cast_fp16)[name = string("transpose_116")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_61_cast_fp16 = matmul(transpose_x = attn_output_61_transpose_x_0, transpose_y = attn_output_61_transpose_y_0, x = var_7045_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_61_cast_fp16")];
+            tensor<int32, [4]> var_7056 = const()[name = string("op_7056"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7063 = const()[name = string("op_7063"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_7057_cast_fp16 = transpose(perm = var_7056, x = attn_output_61_cast_fp16)[name = string("transpose_115")];
+            tensor<fp16, [1, 1, 2048]> attn_output_63_cast_fp16 = reshape(shape = var_7063, x = var_7057_cast_fp16)[name = string("attn_output_63_cast_fp16")];
+            tensor<int32, [3]> var_7068 = const()[name = string("op_7068"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7084_pad_type_0 = const()[name = string("op_7084_pad_type_0"), val = string("valid")];
+            int32 var_7084_groups_0 = const()[name = string("op_7084_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7084_strides_0 = const()[name = string("op_7084_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7084_pad_0 = const()[name = string("op_7084_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7084_dilations_0 = const()[name = string("op_7084_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_10_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(946970752))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949592256))))[name = string("squeeze_10_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_7069_cast_fp16 = transpose(perm = var_7068, x = attn_output_63_cast_fp16)[name = string("transpose_114")];
+            tensor<fp16, [1, 2560, 1]> var_7084_cast_fp16 = conv(dilations = var_7084_dilations_0, groups = var_7084_groups_0, pad = var_7084_pad_0, pad_type = var_7084_pad_type_0, strides = var_7084_strides_0, weight = squeeze_10_cast_fp16_to_fp32_to_fp16_palettized, x = var_7069_cast_fp16)[name = string("op_7084_cast_fp16")];
+            tensor<int32, [3]> var_7088 = const()[name = string("op_7088"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7094 = const()[name = string("op_7094"), val = int32(-1)];
+            fp16 const_125_promoted_to_fp16 = const()[name = string("const_125_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_211_cast_fp16 = transpose(perm = var_7088, x = var_7084_cast_fp16)[name = string("transpose_113")];
+            tensor<fp16, [1, 1, 2560]> var_7096_cast_fp16 = mul(x = x_211_cast_fp16, y = const_125_promoted_to_fp16)[name = string("op_7096_cast_fp16")];
+            bool input_311_interleave_0 = const()[name = string("input_311_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_311_cast_fp16 = concat(axis = var_7094, interleave = input_311_interleave_0, values = (x_211_cast_fp16, var_7096_cast_fp16))[name = string("input_311_cast_fp16")];
+            tensor<int32, [1]> normed_293_axes_0 = const()[name = string("normed_293_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7091_to_fp16 = const()[name = string("op_7091_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_293_cast_fp16 = layer_norm(axes = normed_293_axes_0, epsilon = var_7091_to_fp16, x = input_311_cast_fp16)[name = string("normed_293_cast_fp16")];
+            tensor<int32, [2]> var_7101_split_sizes_0 = const()[name = string("op_7101_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7101_axis_0 = const()[name = string("op_7101_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7101_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7101_cast_fp16_1 = split(axis = var_7101_axis_0, split_sizes = var_7101_split_sizes_0, x = normed_293_cast_fp16)[name = string("op_7101_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_10_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_10_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949594880)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_65_cast_fp16 = mul(x = var_7101_cast_fp16_0, y = layers_c2_10_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_65_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_213_cast_fp16 = add(x = x_199_cast_fp16, y = attn_output_65_cast_fp16)[name = string("x_213_cast_fp16")];
+            int32 var_7110 = const()[name = string("op_7110"), val = int32(-1)];
+            fp16 const_126_promoted_to_fp16 = const()[name = string("const_126_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_7112_cast_fp16 = mul(x = x_213_cast_fp16, y = const_126_promoted_to_fp16)[name = string("op_7112_cast_fp16")];
+            bool input_313_interleave_0 = const()[name = string("input_313_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_313_cast_fp16 = concat(axis = var_7110, interleave = input_313_interleave_0, values = (x_213_cast_fp16, var_7112_cast_fp16))[name = string("input_313_cast_fp16")];
+            tensor<int32, [1]> normed_297_axes_0 = const()[name = string("normed_297_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7107_to_fp16 = const()[name = string("op_7107_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_297_cast_fp16 = layer_norm(axes = normed_297_axes_0, epsilon = var_7107_to_fp16, x = input_313_cast_fp16)[name = string("normed_297_cast_fp16")];
+            tensor<int32, [2]> var_7117_split_sizes_0 = const()[name = string("op_7117_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7117_axis_0 = const()[name = string("op_7117_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7117_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7117_cast_fp16_1 = split(axis = var_7117_axis_0, split_sizes = var_7117_split_sizes_0, x = normed_297_cast_fp16)[name = string("op_7117_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_10_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_10_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949600064)))];
+            tensor<fp16, [1, 1, 2560]> h_63_cast_fp16 = mul(x = var_7117_cast_fp16_0, y = layers_c2_10_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_63_cast_fp16")];
+            tensor<int32, [3]> var_7128 = const()[name = string("op_7128"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_315_axes_0 = const()[name = string("input_315_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7129 = transpose(perm = var_7128, x = h_63_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [1, 2560, 1, 1]> input_315 = expand_dims(axes = input_315_axes_0, x = var_7129)[name = string("input_315")];
+            string gate_41_pad_type_0 = const()[name = string("gate_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_41_strides_0 = const()[name = string("gate_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_41_pad_0 = const()[name = string("gate_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_41_dilations_0 = const()[name = string("gate_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_41_groups_0 = const()[name = string("gate_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_41 = conv(dilations = gate_41_dilations_0, groups = gate_41_groups_0, pad = gate_41_pad_0, pad_type = gate_41_pad_type_0, strides = gate_41_strides_0, weight = layers_c2_10_mlp_gate_proj_weight_palettized, x = input_315)[name = string("gate_41")];
+            string up_21_pad_type_0 = const()[name = string("up_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_21_strides_0 = const()[name = string("up_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_21_pad_0 = const()[name = string("up_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_21_dilations_0 = const()[name = string("up_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_21_groups_0 = const()[name = string("up_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_21 = conv(dilations = up_21_dilations_0, groups = up_21_groups_0, pad = up_21_pad_0, pad_type = up_21_pad_type_0, strides = up_21_strides_0, weight = layers_c2_10_mlp_up_proj_weight_palettized, x = input_315)[name = string("up_21")];
+            string gate_43_mode_0 = const()[name = string("gate_43_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_43 = gelu(mode = gate_43_mode_0, x = gate_41)[name = string("gate_43")];
+            tensor<fp16, [1, 10240, 1, 1]> input_317 = mul(x = gate_43, y = up_21)[name = string("input_317")];
+            string mlp_out_21_pad_type_0 = const()[name = string("mlp_out_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_21_strides_0 = const()[name = string("mlp_out_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_21_pad_0 = const()[name = string("mlp_out_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_21_dilations_0 = const()[name = string("mlp_out_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_21_groups_0 = const()[name = string("mlp_out_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_21 = conv(dilations = mlp_out_21_dilations_0, groups = mlp_out_21_groups_0, pad = mlp_out_21_pad_0, pad_type = mlp_out_21_pad_type_0, strides = mlp_out_21_strides_0, weight = layers_c2_10_mlp_down_proj_weight_palettized, x = input_317)[name = string("mlp_out_21")];
+            tensor<int32, [1]> var_7169_axes_0 = const()[name = string("op_7169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7169 = squeeze(axes = var_7169_axes_0, x = mlp_out_21)[name = string("op_7169")];
+            tensor<int32, [3]> var_7173 = const()[name = string("op_7173"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7179 = const()[name = string("op_7179"), val = int32(-1)];
+            fp16 const_127_promoted = const()[name = string("const_127_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_215 = transpose(perm = var_7173, x = var_7169)[name = string("transpose_111")];
+            tensor<fp16, [1, 1, 2560]> var_7181 = mul(x = x_215, y = const_127_promoted)[name = string("op_7181")];
+            bool input_319_interleave_0 = const()[name = string("input_319_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_319 = concat(axis = var_7179, interleave = input_319_interleave_0, values = (x_215, var_7181))[name = string("input_319")];
+            tensor<int32, [1]> normed_301_axes_0 = const()[name = string("normed_301_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7176_to_fp16 = const()[name = string("op_7176_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_301_cast_fp16 = layer_norm(axes = normed_301_axes_0, epsilon = var_7176_to_fp16, x = input_319)[name = string("normed_301_cast_fp16")];
+            tensor<int32, [2]> var_7186_split_sizes_0 = const()[name = string("op_7186_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7186_axis_0 = const()[name = string("op_7186_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7186_0, tensor<fp16, [1, 1, 2560]> var_7186_1 = split(axis = var_7186_axis_0, split_sizes = var_7186_split_sizes_0, x = normed_301_cast_fp16)[name = string("op_7186")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_103 = mul(x = var_7186_0, y = layers_c2_10_post_feedforward_layernorm_weight)[name = string("hidden_states_103")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_105_cast_fp16 = add(x = x_213_cast_fp16, y = hidden_states_103)[name = string("hidden_states_105_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_21_begin_0 = const()[name = string("per_layer_slice_21_begin_0"), val = tensor<int32, [3]>([0, 0, 5632])];
+            tensor<int32, [3]> per_layer_slice_21_end_0 = const()[name = string("per_layer_slice_21_end_0"), val = tensor<int32, [3]>([1, 1, 5888])];
+            tensor<bool, [3]> per_layer_slice_21_end_mask_0 = const()[name = string("per_layer_slice_21_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_21_cast_fp16 = slice_by_index(begin = per_layer_slice_21_begin_0, end = per_layer_slice_21_end_0, end_mask = per_layer_slice_21_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_21_cast_fp16")];
+            tensor<int32, [3]> var_7214 = const()[name = string("op_7214"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_321_axes_0 = const()[name = string("input_321_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7215 = transpose(perm = var_7214, x = hidden_states_105_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [1, 2560, 1, 1]> input_321 = expand_dims(axes = input_321_axes_0, x = var_7215)[name = string("input_321")];
+            string gated_61_pad_type_0 = const()[name = string("gated_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_61_strides_0 = const()[name = string("gated_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_61_pad_0 = const()[name = string("gated_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_61_dilations_0 = const()[name = string("gated_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_61_groups_0 = const()[name = string("gated_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_61 = conv(dilations = gated_61_dilations_0, groups = gated_61_groups_0, pad = gated_61_pad_0, pad_type = gated_61_pad_type_0, strides = gated_61_strides_0, weight = layers_c2_10_per_layer_input_gate_weight_palettized, x = input_321)[name = string("gated_61")];
+            string gated_63_mode_0 = const()[name = string("gated_63_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_63 = gelu(mode = gated_63_mode_0, x = gated_61)[name = string("gated_63")];
+            tensor<int32, [3]> var_7234 = const()[name = string("op_7234"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_21_axes_0 = const()[name = string("per_layer_slice_conv_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_7235_cast_fp16 = transpose(perm = var_7234, x = per_layer_slice_21_cast_fp16)[name = string("transpose_109")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_21_cast_fp16 = expand_dims(axes = per_layer_slice_conv_21_axes_0, x = var_7235_cast_fp16)[name = string("per_layer_slice_conv_21_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_323_cast_fp16 = mul(x = gated_63, y = per_layer_slice_conv_21_cast_fp16)[name = string("input_323_cast_fp16")];
+            string gated_65_pad_type_0 = const()[name = string("gated_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_65_strides_0 = const()[name = string("gated_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_65_pad_0 = const()[name = string("gated_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_65_dilations_0 = const()[name = string("gated_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_65_groups_0 = const()[name = string("gated_65_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_10_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949605248))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949932992))))[name = string("layers_c2_10_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_65_cast_fp16 = conv(dilations = gated_65_dilations_0, groups = gated_65_groups_0, pad = gated_65_pad_0, pad_type = gated_65_pad_type_0, strides = gated_65_strides_0, weight = layers_c2_10_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_323_cast_fp16)[name = string("gated_65_cast_fp16")];
+            tensor<int32, [1]> var_7251_axes_0 = const()[name = string("op_7251_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7251_cast_fp16 = squeeze(axes = var_7251_axes_0, x = gated_65_cast_fp16)[name = string("op_7251_cast_fp16")];
+            tensor<int32, [3]> var_7255 = const()[name = string("op_7255"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7261 = const()[name = string("op_7261"), val = int32(-1)];
+            fp16 const_128_promoted_to_fp16 = const()[name = string("const_128_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_217_cast_fp16 = transpose(perm = var_7255, x = var_7251_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [1, 1, 2560]> var_7263_cast_fp16 = mul(x = x_217_cast_fp16, y = const_128_promoted_to_fp16)[name = string("op_7263_cast_fp16")];
+            bool input_325_interleave_0 = const()[name = string("input_325_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_325_cast_fp16 = concat(axis = var_7261, interleave = input_325_interleave_0, values = (x_217_cast_fp16, var_7263_cast_fp16))[name = string("input_325_cast_fp16")];
+            tensor<int32, [1]> normed_305_axes_0 = const()[name = string("normed_305_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7258_to_fp16 = const()[name = string("op_7258_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_305_cast_fp16 = layer_norm(axes = normed_305_axes_0, epsilon = var_7258_to_fp16, x = input_325_cast_fp16)[name = string("normed_305_cast_fp16")];
+            tensor<int32, [2]> var_7268_split_sizes_0 = const()[name = string("op_7268_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7268_axis_0 = const()[name = string("op_7268_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7268_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7268_cast_fp16_1 = split(axis = var_7268_axis_0, split_sizes = var_7268_split_sizes_0, x = normed_305_cast_fp16)[name = string("op_7268_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_10_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_10_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949935616)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_109_cast_fp16 = mul(x = var_7268_cast_fp16_0, y = layers_c2_10_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_109_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_111_cast_fp16 = add(x = hidden_states_105_cast_fp16, y = hidden_states_109_cast_fp16)[name = string("hidden_states_111_cast_fp16")];
+            tensor<fp16, [1]> const_129_promoted_to_fp16 = const()[name = string("const_129_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.42p-3])];
+            tensor<fp16, [1, 1, 2560]> x_219_cast_fp16 = mul(x = hidden_states_111_cast_fp16, y = const_129_promoted_to_fp16)[name = string("x_219_cast_fp16")];
+            tensor<int32, [1]> var_7280_axes_0 = const()[name = string("op_7280_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_7280_cast_fp16 = squeeze(axes = var_7280_axes_0, x = K_sliding_out_cast_fp16)[name = string("op_7280_cast_fp16")];
+            tensor<int32, [1]> var_7282_axes_0 = const()[name = string("op_7282_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 512, 512]> var_7282_cast_fp16 = squeeze(axes = var_7282_axes_0, x = V_sliding_out_cast_fp16)[name = string("op_7282_cast_fp16")];
+            tensor<int32, [4]> var_7285_begin_0 = const()[name = string("op_7285_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_7285_end_0 = const()[name = string("op_7285_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_7285_end_mask_0 = const()[name = string("op_7285_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_7285_squeeze_mask_0 = const()[name = string("op_7285_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_7285_cast_fp16 = slice_by_index(begin = var_7285_begin_0, end = var_7285_end_0, end_mask = var_7285_end_mask_0, squeeze_mask = var_7285_squeeze_mask_0, x = K_full_in)[name = string("op_7285_cast_fp16")];
+            tensor<int32, [1]> K_full_slot_axes_0 = const()[name = string("K_full_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> K_full_slot_cast_fp16 = expand_dims(axes = K_full_slot_axes_0, x = var_7285_cast_fp16)[name = string("K_full_slot_cast_fp16")];
+            tensor<int32, [4]> var_7290_begin_0 = const()[name = string("op_7290_begin_0"), val = tensor<int32, [4]>([1, 0, 0, 0])];
+            tensor<int32, [4]> var_7290_end_0 = const()[name = string("op_7290_end_0"), val = tensor<int32, [4]>([2, 2, 2048, 512])];
+            tensor<bool, [4]> var_7290_end_mask_0 = const()[name = string("op_7290_end_mask_0"), val = tensor<bool, [4]>([false, true, true, true])];
+            tensor<bool, [4]> var_7290_squeeze_mask_0 = const()[name = string("op_7290_squeeze_mask_0"), val = tensor<bool, [4]>([true, false, false, false])];
+            tensor<fp16, [2, 2048, 512]> var_7290_cast_fp16 = slice_by_index(begin = var_7290_begin_0, end = var_7290_end_0, end_mask = var_7290_end_mask_0, squeeze_mask = var_7290_squeeze_mask_0, x = V_full_in)[name = string("op_7290_cast_fp16")];
+            tensor<int32, [1]> V_full_slot_axes_0 = const()[name = string("V_full_slot_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 2, 2048, 512]> V_full_slot_cast_fp16 = expand_dims(axes = V_full_slot_axes_0, x = var_7290_cast_fp16)[name = string("V_full_slot_cast_fp16")];
+            int32 var_7297 = const()[name = string("op_7297"), val = int32(-1)];
+            fp16 const_130_promoted_to_fp16 = const()[name = string("const_130_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_7299_cast_fp16 = mul(x = x_219_cast_fp16, y = const_130_promoted_to_fp16)[name = string("op_7299_cast_fp16")];
+            bool input_327_interleave_0 = const()[name = string("input_327_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_327_cast_fp16 = concat(axis = var_7297, interleave = input_327_interleave_0, values = (x_219_cast_fp16, var_7299_cast_fp16))[name = string("input_327_cast_fp16")];
+            tensor<int32, [1]> normed_309_axes_0 = const()[name = string("normed_309_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7294_to_fp16 = const()[name = string("op_7294_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_309_cast_fp16 = layer_norm(axes = normed_309_axes_0, epsilon = var_7294_to_fp16, x = input_327_cast_fp16)[name = string("normed_309_cast_fp16")];
+            tensor<int32, [2]> var_7304_split_sizes_0 = const()[name = string("op_7304_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7304_axis_0 = const()[name = string("op_7304_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7304_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7304_cast_fp16_1 = split(axis = var_7304_axis_0, split_sizes = var_7304_split_sizes_0, x = normed_309_cast_fp16)[name = string("op_7304_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_11_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_11_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949940800)))];
+            tensor<fp16, [1, 1, 2560]> h_67_cast_fp16 = mul(x = var_7304_cast_fp16_0, y = layers_c2_11_input_layernorm_weight_promoted_to_fp16)[name = string("h_67_cast_fp16")];
+            tensor<int32, [3]> var_7310 = const()[name = string("op_7310"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_7313_axes_0 = const()[name = string("op_7313_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7311_cast_fp16 = transpose(perm = var_7310, x = h_67_cast_fp16)[name = string("transpose_107")];
+            tensor<fp16, [1, 2560, 1, 1]> var_7313_cast_fp16 = expand_dims(axes = var_7313_axes_0, x = var_7311_cast_fp16)[name = string("op_7313_cast_fp16")];
+            string var_7329_pad_type_0 = const()[name = string("op_7329_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7329_strides_0 = const()[name = string("op_7329_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7329_pad_0 = const()[name = string("op_7329_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7329_dilations_0 = const()[name = string("op_7329_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7329_groups_0 = const()[name = string("op_7329_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_7329 = conv(dilations = var_7329_dilations_0, groups = var_7329_groups_0, pad = var_7329_pad_0, pad_type = var_7329_pad_type_0, strides = var_7329_strides_0, weight = layers_c2_11_self_attn_q_proj_weight_palettized, x = var_7313_cast_fp16)[name = string("op_7329")];
+            tensor<int32, [4]> var_7334 = const()[name = string("op_7334"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_7335 = reshape(shape = var_7334, x = var_7329)[name = string("op_7335")];
+            tensor<int32, [4]> var_7340 = const()[name = string("op_7340"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7350 = const()[name = string("op_7350"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_7341 = transpose(perm = var_7340, x = var_7335)[name = string("transpose_106")];
+            tensor<fp16, [1, 8, 512]> x_221 = reshape(shape = var_7350, x = var_7341)[name = string("x_221")];
+            int32 var_7356 = const()[name = string("op_7356"), val = int32(-1)];
+            fp16 const_131_promoted = const()[name = string("const_131_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_7358 = mul(x = x_221, y = const_131_promoted)[name = string("op_7358")];
+            bool input_331_interleave_0 = const()[name = string("input_331_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_331 = concat(axis = var_7356, interleave = input_331_interleave_0, values = (x_221, var_7358))[name = string("input_331")];
+            tensor<int32, [1]> normed_313_axes_0 = const()[name = string("normed_313_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7353_to_fp16 = const()[name = string("op_7353_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_313_cast_fp16 = layer_norm(axes = normed_313_axes_0, epsilon = var_7353_to_fp16, x = input_331)[name = string("normed_313_cast_fp16")];
+            tensor<int32, [2]> var_7363_split_sizes_0 = const()[name = string("op_7363_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7363_axis_0 = const()[name = string("op_7363_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_7363_0, tensor<fp16, [1, 8, 512]> var_7363_1 = split(axis = var_7363_axis_0, split_sizes = var_7363_split_sizes_0, x = normed_313_cast_fp16)[name = string("op_7363")];
+            tensor<fp16, [1, 8, 512]> var_7365 = mul(x = var_7363_0, y = layers_c2_11_self_attn_q_norm_weight)[name = string("op_7365")];
+            tensor<int32, [4]> var_7370 = const()[name = string("op_7370"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_91 = reshape(shape = var_7370, x = var_7365)[name = string("q_91")];
+            tensor<fp16, [1, 8, 1, 512]> var_7372_cast_fp16 = mul(x = q_91, y = cos_f)[name = string("op_7372_cast_fp16")];
+            tensor<int32, [2]> var_7373_split_sizes_0 = const()[name = string("op_7373_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7373_axis_0 = const()[name = string("op_7373_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_7373_0, tensor<fp16, [1, 8, 1, 256]> var_7373_1 = split(axis = var_7373_axis_0, split_sizes = var_7373_split_sizes_0, x = q_91)[name = string("op_7373")];
+            fp16 const_132_promoted = const()[name = string("const_132_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_7375 = mul(x = var_7373_1, y = const_132_promoted)[name = string("op_7375")];
+            int32 var_7377 = const()[name = string("op_7377"), val = int32(-1)];
+            bool var_7378_interleave_0 = const()[name = string("op_7378_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_7378 = concat(axis = var_7377, interleave = var_7378_interleave_0, values = (var_7375, var_7373_0))[name = string("op_7378")];
+            tensor<fp16, [1, 8, 1, 512]> var_7379_cast_fp16 = mul(x = var_7378, y = sin_f)[name = string("op_7379_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_95_cast_fp16 = add(x = var_7372_cast_fp16, y = var_7379_cast_fp16)[name = string("q_95_cast_fp16")];
+            string var_7392_pad_type_0 = const()[name = string("op_7392_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7392_strides_0 = const()[name = string("op_7392_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7392_pad_0 = const()[name = string("op_7392_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7392_dilations_0 = const()[name = string("op_7392_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7392_groups_0 = const()[name = string("op_7392_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_7392 = conv(dilations = var_7392_dilations_0, groups = var_7392_groups_0, pad = var_7392_pad_0, pad_type = var_7392_pad_type_0, strides = var_7392_strides_0, weight = layers_c2_11_self_attn_k_proj_weight_palettized, x = var_7313_cast_fp16)[name = string("op_7392")];
+            tensor<int32, [4]> var_7397 = const()[name = string("op_7397"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_7398 = reshape(shape = var_7397, x = var_7392)[name = string("op_7398")];
+            tensor<int32, [4]> var_7403 = const()[name = string("op_7403"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            string var_7420_pad_type_0 = const()[name = string("op_7420_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7420_strides_0 = const()[name = string("op_7420_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7420_pad_0 = const()[name = string("op_7420_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7420_dilations_0 = const()[name = string("op_7420_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7420_groups_0 = const()[name = string("op_7420_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 1024, 1, 1]> var_7420 = conv(dilations = var_7420_dilations_0, groups = var_7420_groups_0, pad = var_7420_pad_0, pad_type = var_7420_pad_type_0, strides = var_7420_strides_0, weight = layers_c2_11_self_attn_v_proj_weight_palettized, x = var_7313_cast_fp16)[name = string("op_7420")];
+            tensor<int32, [4]> var_7425 = const()[name = string("op_7425"), val = tensor<int32, [4]>([1, 2, 512, 1])];
+            tensor<fp16, [1, 2, 512, 1]> var_7426 = reshape(shape = var_7425, x = var_7420)[name = string("op_7426")];
+            tensor<int32, [4]> var_7431 = const()[name = string("op_7431"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7441 = const()[name = string("op_7441"), val = tensor<int32, [3]>([1, 2, 512])];
+            tensor<fp16, [1, 2, 1, 512]> var_7404 = transpose(perm = var_7403, x = var_7398)[name = string("transpose_105")];
+            tensor<fp16, [1, 2, 512]> x_223 = reshape(shape = var_7441, x = var_7404)[name = string("x_223")];
+            int32 var_7447 = const()[name = string("op_7447"), val = int32(-1)];
+            fp16 const_133_promoted = const()[name = string("const_133_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 512]> var_7449 = mul(x = x_223, y = const_133_promoted)[name = string("op_7449")];
+            bool input_333_interleave_0 = const()[name = string("input_333_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1024]> input_333 = concat(axis = var_7447, interleave = input_333_interleave_0, values = (x_223, var_7449))[name = string("input_333")];
+            tensor<int32, [1]> normed_317_axes_0 = const()[name = string("normed_317_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7444_to_fp16 = const()[name = string("op_7444_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1024]> normed_317_cast_fp16 = layer_norm(axes = normed_317_axes_0, epsilon = var_7444_to_fp16, x = input_333)[name = string("normed_317_cast_fp16")];
+            tensor<int32, [2]> var_7454_split_sizes_0 = const()[name = string("op_7454_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_7454_axis_0 = const()[name = string("op_7454_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 512]> var_7454_0, tensor<fp16, [1, 2, 512]> var_7454_1 = split(axis = var_7454_axis_0, split_sizes = var_7454_split_sizes_0, x = normed_317_cast_fp16)[name = string("op_7454")];
+            tensor<fp16, [1, 2, 512]> var_7456 = mul(x = var_7454_0, y = layers_c2_11_self_attn_k_norm_weight)[name = string("op_7456")];
+            tensor<int32, [4]> var_7461 = const()[name = string("op_7461"), val = tensor<int32, [4]>([1, 2, 1, 512])];
+            tensor<fp16, [1, 2, 1, 512]> q_93 = reshape(shape = var_7461, x = var_7456)[name = string("q_93")];
+            fp16 var_7463_promoted = const()[name = string("op_7463_promoted"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2, 1, 512]> var_7432 = transpose(perm = var_7431, x = var_7426)[name = string("transpose_104")];
+            tensor<fp16, [1, 2, 1, 512]> var_7464 = pow(x = var_7432, y = var_7463_promoted)[name = string("op_7464")];
+            tensor<int32, [1]> var_7469_axes_0 = const()[name = string("op_7469_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7469_keep_dims_0 = const()[name = string("op_7469_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2, 1, 1]> var_7469 = reduce_mean(axes = var_7469_axes_0, keep_dims = var_7469_keep_dims_0, x = var_7464)[name = string("op_7469")];
+            fp16 var_7471_to_fp16 = const()[name = string("op_7471_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2, 1, 1]> mean_sq_cast_fp16 = add(x = var_7469, y = var_7471_to_fp16)[name = string("mean_sq_cast_fp16")];
+            fp32 var_7473_epsilon_0 = const()[name = string("op_7473_epsilon_0"), val = fp32(0x1.197998p-40)];
+            tensor<fp16, [1, 2, 1, 1]> var_7473_cast_fp16 = rsqrt(epsilon = var_7473_epsilon_0, x = mean_sq_cast_fp16)[name = string("op_7473_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> v_cast_fp16 = mul(x = var_7432, y = var_7473_cast_fp16)[name = string("v_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> var_7475_cast_fp16 = mul(x = q_93, y = cos_f)[name = string("op_7475_cast_fp16")];
+            tensor<int32, [2]> var_7476_split_sizes_0 = const()[name = string("op_7476_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7476_axis_0 = const()[name = string("op_7476_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2, 1, 256]> var_7476_0, tensor<fp16, [1, 2, 1, 256]> var_7476_1 = split(axis = var_7476_axis_0, split_sizes = var_7476_split_sizes_0, x = q_93)[name = string("op_7476")];
+            fp16 const_134_promoted = const()[name = string("const_134_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2, 1, 256]> var_7478 = mul(x = var_7476_1, y = const_134_promoted)[name = string("op_7478")];
+            int32 var_7480 = const()[name = string("op_7480"), val = int32(-1)];
+            bool var_7481_interleave_0 = const()[name = string("op_7481_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2, 1, 512]> var_7481 = concat(axis = var_7480, interleave = var_7481_interleave_0, values = (var_7478, var_7476_0))[name = string("op_7481")];
+            tensor<fp16, [1, 2, 1, 512]> var_7482_cast_fp16 = mul(x = var_7481, y = sin_f)[name = string("op_7482_cast_fp16")];
+            tensor<fp16, [1, 2, 1, 512]> k_cast_fp16 = add(x = var_7475_cast_fp16, y = var_7482_cast_fp16)[name = string("k_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7488_cast_fp16 = mul(x = K_full_slot_cast_fp16, y = var_4165_cast_fp16)[name = string("op_7488_cast_fp16")];
+            tensor<int32, [4]> var_7489_reps_0 = const()[name = string("op_7489_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7489_cast_fp16 = tile(reps = var_7489_reps_0, x = k_cast_fp16)[name = string("op_7489_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7490_cast_fp16 = mul(x = var_7489_cast_fp16, y = update_mask)[name = string("op_7490_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> kv14_k = add(x = var_7488_cast_fp16, y = var_7490_cast_fp16)[name = string("K_full_out_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7496_cast_fp16 = mul(x = V_full_slot_cast_fp16, y = var_4165_cast_fp16)[name = string("op_7496_cast_fp16")];
+            tensor<int32, [4]> var_7497_reps_0 = const()[name = string("op_7497_reps_0"), val = tensor<int32, [4]>([1, 1, 2048, 1])];
+            tensor<fp16, [1, 2, 2048, 512]> var_7497_cast_fp16 = tile(reps = var_7497_reps_0, x = v_cast_fp16)[name = string("op_7497_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> var_7498_cast_fp16 = mul(x = var_7497_cast_fp16, y = update_mask)[name = string("op_7498_cast_fp16")];
+            tensor<fp16, [1, 2, 2048, 512]> kv14_v = add(x = var_7496_cast_fp16, y = var_7498_cast_fp16)[name = string("V_full_out_cast_fp16")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_22_reps_0 = const()[name = string("tile_22_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_44_cast_fp16 = transpose(perm = transpose_44_perm_0, x = kv14_k)[name = string("transpose_103")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_22_cast_fp16 = tile(reps = tile_22_reps_0, x = transpose_44_cast_fp16)[name = string("tile_22_cast_fp16")];
+            tensor<int32, [5]> concat_44 = const()[name = string("concat_44"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_44_cast_fp16 = reshape(shape = concat_44, x = tile_22_cast_fp16)[name = string("reshape_44_cast_fp16")];
+            tensor<int32, [5]> transpose_45_perm_0 = const()[name = string("transpose_45_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_45 = const()[name = string("concat_45"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_45_cast_fp16 = transpose(perm = transpose_45_perm_0, x = reshape_44_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_45_cast_fp16 = reshape(shape = concat_45, x = transpose_45_cast_fp16)[name = string("reshape_45_cast_fp16")];
+            tensor<int32, [4]> transpose_95_perm_0 = const()[name = string("transpose_95_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_23_reps_0 = const()[name = string("tile_23_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_46_cast_fp16 = transpose(perm = transpose_46_perm_0, x = kv14_v)[name = string("transpose_101")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_23_cast_fp16 = tile(reps = tile_23_reps_0, x = transpose_46_cast_fp16)[name = string("tile_23_cast_fp16")];
+            tensor<int32, [5]> concat_46 = const()[name = string("concat_46"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_46_cast_fp16 = reshape(shape = concat_46, x = tile_23_cast_fp16)[name = string("reshape_46_cast_fp16")];
+            tensor<int32, [5]> transpose_47_perm_0 = const()[name = string("transpose_47_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_47 = const()[name = string("concat_47"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_47_cast_fp16 = transpose(perm = transpose_47_perm_0, x = reshape_46_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_47_cast_fp16 = reshape(shape = concat_47, x = transpose_47_cast_fp16)[name = string("reshape_47_cast_fp16")];
+            tensor<int32, [4]> V_expanded_23_perm_0 = const()[name = string("V_expanded_23_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_45_transpose_x_0 = const()[name = string("attn_weights_45_transpose_x_0"), val = bool(false)];
+            bool attn_weights_45_transpose_y_0 = const()[name = string("attn_weights_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_95_cast_fp16 = transpose(perm = transpose_95_perm_0, x = reshape_45_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_45_cast_fp16 = matmul(transpose_x = attn_weights_45_transpose_x_0, transpose_y = attn_weights_45_transpose_y_0, x = q_95_cast_fp16, y = transpose_95_cast_fp16)[name = string("attn_weights_45_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_227_cast_fp16 = add(x = attn_weights_45_cast_fp16, y = causal_mask_full)[name = string("x_227_cast_fp16")];
+            tensor<int32, [1]> reduce_max_11_axes_0 = const()[name = string("reduce_max_11_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_11_keep_dims_0 = const()[name = string("reduce_max_11_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_11 = reduce_max(axes = reduce_max_11_axes_0, keep_dims = reduce_max_11_keep_dims_0, x = x_227_cast_fp16)[name = string("reduce_max_11")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7550 = sub(x = x_227_cast_fp16, y = reduce_max_11)[name = string("op_7550")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7556 = exp(x = var_7550)[name = string("op_7556")];
+            tensor<int32, [1]> var_7566_axes_0 = const()[name = string("op_7566_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7566_keep_dims_0 = const()[name = string("op_7566_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_7566 = reduce_sum(axes = var_7566_axes_0, keep_dims = var_7566_keep_dims_0, x = var_7556)[name = string("op_7566")];
+            tensor<fp16, [1, 8, 1, 2048]> var_7572_cast_fp16 = real_div(x = var_7556, y = var_7566)[name = string("op_7572_cast_fp16")];
+            bool attn_output_67_transpose_x_0 = const()[name = string("attn_output_67_transpose_x_0"), val = bool(false)];
+            bool attn_output_67_transpose_y_0 = const()[name = string("attn_output_67_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_23_cast_fp16 = transpose(perm = V_expanded_23_perm_0, x = reshape_47_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_67_cast_fp16 = matmul(transpose_x = attn_output_67_transpose_x_0, transpose_y = attn_output_67_transpose_y_0, x = var_7572_cast_fp16, y = V_expanded_23_cast_fp16)[name = string("attn_output_67_cast_fp16")];
+            tensor<int32, [4]> var_7583 = const()[name = string("op_7583"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7590 = const()[name = string("op_7590"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_7584_cast_fp16 = transpose(perm = var_7583, x = attn_output_67_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 1, 4096]> attn_output_69_cast_fp16 = reshape(shape = var_7590, x = var_7584_cast_fp16)[name = string("attn_output_69_cast_fp16")];
+            tensor<int32, [3]> var_7595 = const()[name = string("op_7595"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7611_pad_type_0 = const()[name = string("op_7611_pad_type_0"), val = string("valid")];
+            int32 var_7611_groups_0 = const()[name = string("op_7611_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7611_strides_0 = const()[name = string("op_7611_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7611_pad_0 = const()[name = string("op_7611_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7611_dilations_0 = const()[name = string("op_7611_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_11_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(949945984))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955188928))))[name = string("squeeze_11_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_7596_cast_fp16 = transpose(perm = var_7595, x = attn_output_69_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 2560, 1]> var_7611_cast_fp16 = conv(dilations = var_7611_dilations_0, groups = var_7611_groups_0, pad = var_7611_pad_0, pad_type = var_7611_pad_type_0, strides = var_7611_strides_0, weight = squeeze_11_cast_fp16_to_fp32_to_fp16_palettized, x = var_7596_cast_fp16)[name = string("op_7611_cast_fp16")];
+            tensor<int32, [3]> var_7615 = const()[name = string("op_7615"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7621 = const()[name = string("op_7621"), val = int32(-1)];
+            fp16 const_135_promoted_to_fp16 = const()[name = string("const_135_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_231_cast_fp16 = transpose(perm = var_7615, x = var_7611_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 1, 2560]> var_7623_cast_fp16 = mul(x = x_231_cast_fp16, y = const_135_promoted_to_fp16)[name = string("op_7623_cast_fp16")];
+            bool input_337_interleave_0 = const()[name = string("input_337_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_337_cast_fp16 = concat(axis = var_7621, interleave = input_337_interleave_0, values = (x_231_cast_fp16, var_7623_cast_fp16))[name = string("input_337_cast_fp16")];
+            tensor<int32, [1]> normed_321_axes_0 = const()[name = string("normed_321_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7618_to_fp16 = const()[name = string("op_7618_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_321_cast_fp16 = layer_norm(axes = normed_321_axes_0, epsilon = var_7618_to_fp16, x = input_337_cast_fp16)[name = string("normed_321_cast_fp16")];
+            tensor<int32, [2]> var_7628_split_sizes_0 = const()[name = string("op_7628_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7628_axis_0 = const()[name = string("op_7628_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7628_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7628_cast_fp16_1 = split(axis = var_7628_axis_0, split_sizes = var_7628_split_sizes_0, x = normed_321_cast_fp16)[name = string("op_7628_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_11_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_11_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955191552)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_71_cast_fp16 = mul(x = var_7628_cast_fp16_0, y = layers_c2_11_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_71_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_233_cast_fp16 = add(x = x_219_cast_fp16, y = attn_output_71_cast_fp16)[name = string("x_233_cast_fp16")];
+            int32 var_7637 = const()[name = string("op_7637"), val = int32(-1)];
+            fp16 const_136_promoted_to_fp16 = const()[name = string("const_136_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_7639_cast_fp16 = mul(x = x_233_cast_fp16, y = const_136_promoted_to_fp16)[name = string("op_7639_cast_fp16")];
+            bool input_339_interleave_0 = const()[name = string("input_339_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_339_cast_fp16 = concat(axis = var_7637, interleave = input_339_interleave_0, values = (x_233_cast_fp16, var_7639_cast_fp16))[name = string("input_339_cast_fp16")];
+            tensor<int32, [1]> normed_325_axes_0 = const()[name = string("normed_325_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7634_to_fp16 = const()[name = string("op_7634_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_325_cast_fp16 = layer_norm(axes = normed_325_axes_0, epsilon = var_7634_to_fp16, x = input_339_cast_fp16)[name = string("normed_325_cast_fp16")];
+            tensor<int32, [2]> var_7644_split_sizes_0 = const()[name = string("op_7644_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7644_axis_0 = const()[name = string("op_7644_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7644_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7644_cast_fp16_1 = split(axis = var_7644_axis_0, split_sizes = var_7644_split_sizes_0, x = normed_325_cast_fp16)[name = string("op_7644_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_11_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c2_11_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955196736)))];
+            tensor<fp16, [1, 1, 2560]> h_69_cast_fp16 = mul(x = var_7644_cast_fp16_0, y = layers_c2_11_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_69_cast_fp16")];
+            tensor<int32, [3]> var_7655 = const()[name = string("op_7655"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_341_axes_0 = const()[name = string("input_341_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7656 = transpose(perm = var_7655, x = h_69_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 2560, 1, 1]> input_341 = expand_dims(axes = input_341_axes_0, x = var_7656)[name = string("input_341")];
+            string gate_45_pad_type_0 = const()[name = string("gate_45_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_45_strides_0 = const()[name = string("gate_45_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_45_pad_0 = const()[name = string("gate_45_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_45_dilations_0 = const()[name = string("gate_45_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_45_groups_0 = const()[name = string("gate_45_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_45 = conv(dilations = gate_45_dilations_0, groups = gate_45_groups_0, pad = gate_45_pad_0, pad_type = gate_45_pad_type_0, strides = gate_45_strides_0, weight = layers_c2_11_mlp_gate_proj_weight_palettized, x = input_341)[name = string("gate_45")];
+            string up_23_pad_type_0 = const()[name = string("up_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_23_strides_0 = const()[name = string("up_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_23_pad_0 = const()[name = string("up_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_23_dilations_0 = const()[name = string("up_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_23_groups_0 = const()[name = string("up_23_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_23 = conv(dilations = up_23_dilations_0, groups = up_23_groups_0, pad = up_23_pad_0, pad_type = up_23_pad_type_0, strides = up_23_strides_0, weight = layers_c2_11_mlp_up_proj_weight_palettized, x = input_341)[name = string("up_23")];
+            string gate_47_mode_0 = const()[name = string("gate_47_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_47 = gelu(mode = gate_47_mode_0, x = gate_45)[name = string("gate_47")];
+            tensor<fp16, [1, 10240, 1, 1]> input_343 = mul(x = gate_47, y = up_23)[name = string("input_343")];
+            string mlp_out_23_pad_type_0 = const()[name = string("mlp_out_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_23_strides_0 = const()[name = string("mlp_out_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_23_pad_0 = const()[name = string("mlp_out_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_23_dilations_0 = const()[name = string("mlp_out_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_23_groups_0 = const()[name = string("mlp_out_23_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_23 = conv(dilations = mlp_out_23_dilations_0, groups = mlp_out_23_groups_0, pad = mlp_out_23_pad_0, pad_type = mlp_out_23_pad_type_0, strides = mlp_out_23_strides_0, weight = layers_c2_11_mlp_down_proj_weight_palettized, x = input_343)[name = string("mlp_out_23")];
+            tensor<int32, [1]> var_7696_axes_0 = const()[name = string("op_7696_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7696 = squeeze(axes = var_7696_axes_0, x = mlp_out_23)[name = string("op_7696")];
+            tensor<int32, [3]> var_7700 = const()[name = string("op_7700"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7706 = const()[name = string("op_7706"), val = int32(-1)];
+            fp16 const_137_promoted = const()[name = string("const_137_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_235 = transpose(perm = var_7700, x = var_7696)[name = string("transpose_93")];
+            tensor<fp16, [1, 1, 2560]> var_7708 = mul(x = x_235, y = const_137_promoted)[name = string("op_7708")];
+            bool input_345_interleave_0 = const()[name = string("input_345_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_345 = concat(axis = var_7706, interleave = input_345_interleave_0, values = (x_235, var_7708))[name = string("input_345")];
+            tensor<int32, [1]> normed_329_axes_0 = const()[name = string("normed_329_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7703_to_fp16 = const()[name = string("op_7703_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_329_cast_fp16 = layer_norm(axes = normed_329_axes_0, epsilon = var_7703_to_fp16, x = input_345)[name = string("normed_329_cast_fp16")];
+            tensor<int32, [2]> var_7713_split_sizes_0 = const()[name = string("op_7713_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7713_axis_0 = const()[name = string("op_7713_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7713_0, tensor<fp16, [1, 1, 2560]> var_7713_1 = split(axis = var_7713_axis_0, split_sizes = var_7713_split_sizes_0, x = normed_329_cast_fp16)[name = string("op_7713")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_113 = mul(x = var_7713_0, y = layers_c2_11_post_feedforward_layernorm_weight)[name = string("hidden_states_113")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_115_cast_fp16 = add(x = x_233_cast_fp16, y = hidden_states_113)[name = string("hidden_states_115_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_23_begin_0 = const()[name = string("per_layer_slice_23_begin_0"), val = tensor<int32, [3]>([0, 0, 5888])];
+            tensor<int32, [3]> per_layer_slice_23_end_0 = const()[name = string("per_layer_slice_23_end_0"), val = tensor<int32, [3]>([1, 1, 6144])];
+            tensor<bool, [3]> per_layer_slice_23_end_mask_0 = const()[name = string("per_layer_slice_23_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_23_cast_fp16 = slice_by_index(begin = per_layer_slice_23_begin_0, end = per_layer_slice_23_end_0, end_mask = per_layer_slice_23_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_23_cast_fp16")];
+            tensor<int32, [3]> var_7741 = const()[name = string("op_7741"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_347_axes_0 = const()[name = string("input_347_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7742 = transpose(perm = var_7741, x = hidden_states_115_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 2560, 1, 1]> input_347 = expand_dims(axes = input_347_axes_0, x = var_7742)[name = string("input_347")];
+            string gated_67_pad_type_0 = const()[name = string("gated_67_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_67_strides_0 = const()[name = string("gated_67_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_67_pad_0 = const()[name = string("gated_67_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_67_dilations_0 = const()[name = string("gated_67_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_67_groups_0 = const()[name = string("gated_67_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_67 = conv(dilations = gated_67_dilations_0, groups = gated_67_groups_0, pad = gated_67_pad_0, pad_type = gated_67_pad_type_0, strides = gated_67_strides_0, weight = layers_c2_11_per_layer_input_gate_weight_palettized, x = input_347)[name = string("gated_67")];
+            string gated_69_mode_0 = const()[name = string("gated_69_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_69 = gelu(mode = gated_69_mode_0, x = gated_67)[name = string("gated_69")];
+            tensor<int32, [3]> var_7761 = const()[name = string("op_7761"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_23_axes_0 = const()[name = string("per_layer_slice_conv_23_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_7762_cast_fp16 = transpose(perm = var_7761, x = per_layer_slice_23_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_23_cast_fp16 = expand_dims(axes = per_layer_slice_conv_23_axes_0, x = var_7762_cast_fp16)[name = string("per_layer_slice_conv_23_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_349_cast_fp16 = mul(x = gated_69, y = per_layer_slice_conv_23_cast_fp16)[name = string("input_349_cast_fp16")];
+            string gated_71_pad_type_0 = const()[name = string("gated_71_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_71_strides_0 = const()[name = string("gated_71_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_71_pad_0 = const()[name = string("gated_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_71_dilations_0 = const()[name = string("gated_71_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_71_groups_0 = const()[name = string("gated_71_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c2_11_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955201920))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955529664))))[name = string("layers_c2_11_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_71_cast_fp16 = conv(dilations = gated_71_dilations_0, groups = gated_71_groups_0, pad = gated_71_pad_0, pad_type = gated_71_pad_type_0, strides = gated_71_strides_0, weight = layers_c2_11_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_349_cast_fp16)[name = string("gated_71_cast_fp16")];
+            tensor<int32, [1]> var_7778_axes_0 = const()[name = string("op_7778_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7778_cast_fp16 = squeeze(axes = var_7778_axes_0, x = gated_71_cast_fp16)[name = string("op_7778_cast_fp16")];
+            tensor<int32, [3]> var_7782 = const()[name = string("op_7782"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7788 = const()[name = string("op_7788"), val = int32(-1)];
+            fp16 const_138_promoted_to_fp16 = const()[name = string("const_138_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_237_cast_fp16 = transpose(perm = var_7782, x = var_7778_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 1, 2560]> var_7790_cast_fp16 = mul(x = x_237_cast_fp16, y = const_138_promoted_to_fp16)[name = string("op_7790_cast_fp16")];
+            bool input_351_interleave_0 = const()[name = string("input_351_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_351_cast_fp16 = concat(axis = var_7788, interleave = input_351_interleave_0, values = (x_237_cast_fp16, var_7790_cast_fp16))[name = string("input_351_cast_fp16")];
+            tensor<int32, [1]> normed_333_axes_0 = const()[name = string("normed_333_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7785_to_fp16 = const()[name = string("op_7785_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_333_cast_fp16 = layer_norm(axes = normed_333_axes_0, epsilon = var_7785_to_fp16, x = input_351_cast_fp16)[name = string("normed_333_cast_fp16")];
+            tensor<int32, [2]> var_7795_split_sizes_0 = const()[name = string("op_7795_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7795_axis_0 = const()[name = string("op_7795_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7795_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7795_cast_fp16_1 = split(axis = var_7795_axis_0, split_sizes = var_7795_split_sizes_0, x = normed_333_cast_fp16)[name = string("op_7795_cast_fp16")];
+            tensor<fp16, [2560]> layers_c2_11_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c2_11_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955532288)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_119_cast_fp16 = mul(x = var_7795_cast_fp16_0, y = layers_c2_11_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_119_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_121_cast_fp16 = add(x = hidden_states_115_cast_fp16, y = hidden_states_119_cast_fp16)[name = string("hidden_states_121_cast_fp16")];
+            tensor<fp16, [1]> const_139_promoted_to_fp16 = const()[name = string("const_139_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.0cp-4])];
+            tensor<fp16, [1, 1, 2560]> x_239_cast_fp16 = mul(x = hidden_states_121_cast_fp16, y = const_139_promoted_to_fp16)[name = string("x_239_cast_fp16")];
+            tensor<int32, [1]> var_7807_axes_0 = const()[name = string("op_7807_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_7807_cast_fp16 = squeeze(axes = var_7807_axes_0, x = kv14_k)[name = string("op_7807_cast_fp16")];
+            tensor<int32, [1]> var_7809_axes_0 = const()[name = string("op_7809_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [2, 2048, 512]> var_7809_cast_fp16 = squeeze(axes = var_7809_axes_0, x = kv14_v)[name = string("op_7809_cast_fp16")];
+            int32 var_7814 = const()[name = string("op_7814"), val = int32(-1)];
+            fp16 const_140_promoted_to_fp16 = const()[name = string("const_140_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_7816_cast_fp16 = mul(x = x_239_cast_fp16, y = const_140_promoted_to_fp16)[name = string("op_7816_cast_fp16")];
+            bool input_353_interleave_0 = const()[name = string("input_353_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_353_cast_fp16 = concat(axis = var_7814, interleave = input_353_interleave_0, values = (x_239_cast_fp16, var_7816_cast_fp16))[name = string("input_353_cast_fp16")];
+            tensor<int32, [1]> normed_337_axes_0 = const()[name = string("normed_337_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7811_to_fp16 = const()[name = string("op_7811_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_337_cast_fp16 = layer_norm(axes = normed_337_axes_0, epsilon = var_7811_to_fp16, x = input_353_cast_fp16)[name = string("normed_337_cast_fp16")];
+            tensor<int32, [2]> var_7821_split_sizes_0 = const()[name = string("op_7821_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_7821_axis_0 = const()[name = string("op_7821_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_7821_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_7821_cast_fp16_1 = split(axis = var_7821_axis_0, split_sizes = var_7821_split_sizes_0, x = normed_337_cast_fp16)[name = string("op_7821_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955537472)))];
+            tensor<fp16, [1, 1, 2560]> h_73_cast_fp16 = mul(x = var_7821_cast_fp16_0, y = layers_c3_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_73_cast_fp16")];
+            tensor<int32, [3]> var_7827 = const()[name = string("op_7827"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_7830_axes_0 = const()[name = string("op_7830_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_7828_cast_fp16 = transpose(perm = var_7827, x = h_73_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 2560, 1, 1]> var_7830_cast_fp16 = expand_dims(axes = var_7830_axes_0, x = var_7828_cast_fp16)[name = string("op_7830_cast_fp16")];
+            string var_7846_pad_type_0 = const()[name = string("op_7846_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_7846_strides_0 = const()[name = string("op_7846_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_7846_pad_0 = const()[name = string("op_7846_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_7846_dilations_0 = const()[name = string("op_7846_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_7846_groups_0 = const()[name = string("op_7846_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_7846 = conv(dilations = var_7846_dilations_0, groups = var_7846_groups_0, pad = var_7846_pad_0, pad_type = var_7846_pad_type_0, strides = var_7846_strides_0, weight = layers_c3_0_self_attn_q_proj_weight_palettized, x = var_7830_cast_fp16)[name = string("op_7846")];
+            tensor<int32, [4]> var_7851 = const()[name = string("op_7851"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_7852 = reshape(shape = var_7851, x = var_7846)[name = string("op_7852")];
+            tensor<int32, [4]> var_7857 = const()[name = string("op_7857"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_7867 = const()[name = string("op_7867"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_7858 = transpose(perm = var_7857, x = var_7852)[name = string("transpose_88")];
+            tensor<fp16, [1, 8, 256]> x_241 = reshape(shape = var_7867, x = var_7858)[name = string("x_241")];
+            int32 var_7873 = const()[name = string("op_7873"), val = int32(-1)];
+            fp16 const_141_promoted = const()[name = string("const_141_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_7875 = mul(x = x_241, y = const_141_promoted)[name = string("op_7875")];
+            bool input_357_interleave_0 = const()[name = string("input_357_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_357 = concat(axis = var_7873, interleave = input_357_interleave_0, values = (x_241, var_7875))[name = string("input_357")];
+            tensor<int32, [1]> normed_341_axes_0 = const()[name = string("normed_341_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7870_to_fp16 = const()[name = string("op_7870_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_341_cast_fp16 = layer_norm(axes = normed_341_axes_0, epsilon = var_7870_to_fp16, x = input_357)[name = string("normed_341_cast_fp16")];
+            tensor<int32, [2]> var_7880_split_sizes_0 = const()[name = string("op_7880_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_7880_axis_0 = const()[name = string("op_7880_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_7880_0, tensor<fp16, [1, 8, 256]> var_7880_1 = split(axis = var_7880_axis_0, split_sizes = var_7880_split_sizes_0, x = normed_341_cast_fp16)[name = string("op_7880")];
+            tensor<fp16, [1, 8, 256]> var_7882 = mul(x = var_7880_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_7882")];
+            tensor<int32, [4]> var_7887 = const()[name = string("op_7887"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_99 = reshape(shape = var_7887, x = var_7882)[name = string("q_99")];
+            tensor<fp16, [1, 8, 1, 256]> var_7889_cast_fp16 = mul(x = q_99, y = cos_s)[name = string("op_7889_cast_fp16")];
+            tensor<int32, [2]> var_7890_split_sizes_0 = const()[name = string("op_7890_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_7890_axis_0 = const()[name = string("op_7890_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_7890_0, tensor<fp16, [1, 8, 1, 128]> var_7890_1 = split(axis = var_7890_axis_0, split_sizes = var_7890_split_sizes_0, x = q_99)[name = string("op_7890")];
+            fp16 const_142_promoted = const()[name = string("const_142_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_7892 = mul(x = var_7890_1, y = const_142_promoted)[name = string("op_7892")];
+            int32 var_7894 = const()[name = string("op_7894"), val = int32(-1)];
+            bool var_7895_interleave_0 = const()[name = string("op_7895_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_7895 = concat(axis = var_7894, interleave = var_7895_interleave_0, values = (var_7892, var_7890_0))[name = string("op_7895")];
+            tensor<fp16, [1, 8, 1, 256]> var_7896_cast_fp16 = mul(x = var_7895, y = sin_s)[name = string("op_7896_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_101_cast_fp16 = add(x = var_7889_cast_fp16, y = var_7896_cast_fp16)[name = string("q_101_cast_fp16")];
+            bool attn_weights_49_transpose_x_0 = const()[name = string("attn_weights_49_transpose_x_0"), val = bool(false)];
+            bool attn_weights_49_transpose_y_0 = const()[name = string("attn_weights_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_49_cast_fp16 = matmul(transpose_x = attn_weights_49_transpose_x_0, transpose_y = attn_weights_49_transpose_y_0, x = q_101_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_49_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_243_cast_fp16 = add(x = attn_weights_49_cast_fp16, y = causal_mask_sliding)[name = string("x_243_cast_fp16")];
+            tensor<int32, [1]> reduce_max_12_axes_0 = const()[name = string("reduce_max_12_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_12_keep_dims_0 = const()[name = string("reduce_max_12_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_12 = reduce_max(axes = reduce_max_12_axes_0, keep_dims = reduce_max_12_keep_dims_0, x = x_243_cast_fp16)[name = string("reduce_max_12")];
+            tensor<fp16, [1, 8, 1, 512]> var_7928 = sub(x = x_243_cast_fp16, y = reduce_max_12)[name = string("op_7928")];
+            tensor<fp16, [1, 8, 1, 512]> var_7934 = exp(x = var_7928)[name = string("op_7934")];
+            tensor<int32, [1]> var_7944_axes_0 = const()[name = string("op_7944_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_7944_keep_dims_0 = const()[name = string("op_7944_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_7944 = reduce_sum(axes = var_7944_axes_0, keep_dims = var_7944_keep_dims_0, x = var_7934)[name = string("op_7944")];
+            tensor<fp16, [1, 8, 1, 512]> var_7950_cast_fp16 = real_div(x = var_7934, y = var_7944)[name = string("op_7950_cast_fp16")];
+            bool attn_output_73_transpose_x_0 = const()[name = string("attn_output_73_transpose_x_0"), val = bool(false)];
+            bool attn_output_73_transpose_y_0 = const()[name = string("attn_output_73_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_73_cast_fp16 = matmul(transpose_x = attn_output_73_transpose_x_0, transpose_y = attn_output_73_transpose_y_0, x = var_7950_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_73_cast_fp16")];
+            tensor<int32, [4]> var_7961 = const()[name = string("op_7961"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_7968 = const()[name = string("op_7968"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_7962_cast_fp16 = transpose(perm = var_7961, x = attn_output_73_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 1, 2048]> attn_output_75_cast_fp16 = reshape(shape = var_7968, x = var_7962_cast_fp16)[name = string("attn_output_75_cast_fp16")];
+            tensor<int32, [3]> var_7973 = const()[name = string("op_7973"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_7989_pad_type_0 = const()[name = string("op_7989_pad_type_0"), val = string("valid")];
+            int32 var_7989_groups_0 = const()[name = string("op_7989_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_7989_strides_0 = const()[name = string("op_7989_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_7989_pad_0 = const()[name = string("op_7989_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_7989_dilations_0 = const()[name = string("op_7989_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_12_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(955542656))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958164160))))[name = string("squeeze_12_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_7974_cast_fp16 = transpose(perm = var_7973, x = attn_output_75_cast_fp16)[name = string("transpose_86")];
+            tensor<fp16, [1, 2560, 1]> var_7989_cast_fp16 = conv(dilations = var_7989_dilations_0, groups = var_7989_groups_0, pad = var_7989_pad_0, pad_type = var_7989_pad_type_0, strides = var_7989_strides_0, weight = squeeze_12_cast_fp16_to_fp32_to_fp16_palettized, x = var_7974_cast_fp16)[name = string("op_7989_cast_fp16")];
+            tensor<int32, [3]> var_7993 = const()[name = string("op_7993"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_7999 = const()[name = string("op_7999"), val = int32(-1)];
+            fp16 const_143_promoted_to_fp16 = const()[name = string("const_143_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_247_cast_fp16 = transpose(perm = var_7993, x = var_7989_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 1, 2560]> var_8001_cast_fp16 = mul(x = x_247_cast_fp16, y = const_143_promoted_to_fp16)[name = string("op_8001_cast_fp16")];
+            bool input_361_interleave_0 = const()[name = string("input_361_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_361_cast_fp16 = concat(axis = var_7999, interleave = input_361_interleave_0, values = (x_247_cast_fp16, var_8001_cast_fp16))[name = string("input_361_cast_fp16")];
+            tensor<int32, [1]> normed_345_axes_0 = const()[name = string("normed_345_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_7996_to_fp16 = const()[name = string("op_7996_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_345_cast_fp16 = layer_norm(axes = normed_345_axes_0, epsilon = var_7996_to_fp16, x = input_361_cast_fp16)[name = string("normed_345_cast_fp16")];
+            tensor<int32, [2]> var_8006_split_sizes_0 = const()[name = string("op_8006_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8006_axis_0 = const()[name = string("op_8006_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8006_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8006_cast_fp16_1 = split(axis = var_8006_axis_0, split_sizes = var_8006_split_sizes_0, x = normed_345_cast_fp16)[name = string("op_8006_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958166784)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_77_cast_fp16 = mul(x = var_8006_cast_fp16_0, y = layers_c3_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_77_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_249_cast_fp16 = add(x = x_239_cast_fp16, y = attn_output_77_cast_fp16)[name = string("x_249_cast_fp16")];
+            int32 var_8015 = const()[name = string("op_8015"), val = int32(-1)];
+            fp16 const_144_promoted_to_fp16 = const()[name = string("const_144_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_8017_cast_fp16 = mul(x = x_249_cast_fp16, y = const_144_promoted_to_fp16)[name = string("op_8017_cast_fp16")];
+            bool input_363_interleave_0 = const()[name = string("input_363_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_363_cast_fp16 = concat(axis = var_8015, interleave = input_363_interleave_0, values = (x_249_cast_fp16, var_8017_cast_fp16))[name = string("input_363_cast_fp16")];
+            tensor<int32, [1]> normed_349_axes_0 = const()[name = string("normed_349_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8012_to_fp16 = const()[name = string("op_8012_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_349_cast_fp16 = layer_norm(axes = normed_349_axes_0, epsilon = var_8012_to_fp16, x = input_363_cast_fp16)[name = string("normed_349_cast_fp16")];
+            tensor<int32, [2]> var_8022_split_sizes_0 = const()[name = string("op_8022_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8022_axis_0 = const()[name = string("op_8022_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8022_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8022_cast_fp16_1 = split(axis = var_8022_axis_0, split_sizes = var_8022_split_sizes_0, x = normed_349_cast_fp16)[name = string("op_8022_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958171968)))];
+            tensor<fp16, [1, 1, 2560]> h_75_cast_fp16 = mul(x = var_8022_cast_fp16_0, y = layers_c3_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_75_cast_fp16")];
+            tensor<int32, [3]> var_8033 = const()[name = string("op_8033"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_365_axes_0 = const()[name = string("input_365_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8034 = transpose(perm = var_8033, x = h_75_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 2560, 1, 1]> input_365 = expand_dims(axes = input_365_axes_0, x = var_8034)[name = string("input_365")];
+            string gate_49_pad_type_0 = const()[name = string("gate_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_49_strides_0 = const()[name = string("gate_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_49_pad_0 = const()[name = string("gate_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_49_dilations_0 = const()[name = string("gate_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_49_groups_0 = const()[name = string("gate_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_49 = conv(dilations = gate_49_dilations_0, groups = gate_49_groups_0, pad = gate_49_pad_0, pad_type = gate_49_pad_type_0, strides = gate_49_strides_0, weight = layers_c3_0_mlp_gate_proj_weight_palettized, x = input_365)[name = string("gate_49")];
+            string up_25_pad_type_0 = const()[name = string("up_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_25_strides_0 = const()[name = string("up_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_25_pad_0 = const()[name = string("up_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_25_dilations_0 = const()[name = string("up_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_25_groups_0 = const()[name = string("up_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_25 = conv(dilations = up_25_dilations_0, groups = up_25_groups_0, pad = up_25_pad_0, pad_type = up_25_pad_type_0, strides = up_25_strides_0, weight = layers_c3_0_mlp_up_proj_weight_palettized, x = input_365)[name = string("up_25")];
+            string gate_51_mode_0 = const()[name = string("gate_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_51 = gelu(mode = gate_51_mode_0, x = gate_49)[name = string("gate_51")];
+            tensor<fp16, [1, 10240, 1, 1]> input_367 = mul(x = gate_51, y = up_25)[name = string("input_367")];
+            string mlp_out_25_pad_type_0 = const()[name = string("mlp_out_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_25_strides_0 = const()[name = string("mlp_out_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_25_pad_0 = const()[name = string("mlp_out_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_25_dilations_0 = const()[name = string("mlp_out_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_25_groups_0 = const()[name = string("mlp_out_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_25 = conv(dilations = mlp_out_25_dilations_0, groups = mlp_out_25_groups_0, pad = mlp_out_25_pad_0, pad_type = mlp_out_25_pad_type_0, strides = mlp_out_25_strides_0, weight = layers_c3_0_mlp_down_proj_weight_palettized, x = input_367)[name = string("mlp_out_25")];
+            tensor<int32, [1]> var_8074_axes_0 = const()[name = string("op_8074_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8074 = squeeze(axes = var_8074_axes_0, x = mlp_out_25)[name = string("op_8074")];
+            tensor<int32, [3]> var_8078 = const()[name = string("op_8078"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8084 = const()[name = string("op_8084"), val = int32(-1)];
+            fp16 const_145_promoted = const()[name = string("const_145_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_251 = transpose(perm = var_8078, x = var_8074)[name = string("transpose_83")];
+            tensor<fp16, [1, 1, 2560]> var_8086 = mul(x = x_251, y = const_145_promoted)[name = string("op_8086")];
+            bool input_369_interleave_0 = const()[name = string("input_369_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_369 = concat(axis = var_8084, interleave = input_369_interleave_0, values = (x_251, var_8086))[name = string("input_369")];
+            tensor<int32, [1]> normed_353_axes_0 = const()[name = string("normed_353_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8081_to_fp16 = const()[name = string("op_8081_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_353_cast_fp16 = layer_norm(axes = normed_353_axes_0, epsilon = var_8081_to_fp16, x = input_369)[name = string("normed_353_cast_fp16")];
+            tensor<int32, [2]> var_8091_split_sizes_0 = const()[name = string("op_8091_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8091_axis_0 = const()[name = string("op_8091_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8091_0, tensor<fp16, [1, 1, 2560]> var_8091_1 = split(axis = var_8091_axis_0, split_sizes = var_8091_split_sizes_0, x = normed_353_cast_fp16)[name = string("op_8091")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_123 = mul(x = var_8091_0, y = layers_c3_0_post_feedforward_layernorm_weight)[name = string("hidden_states_123")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_125_cast_fp16 = add(x = x_249_cast_fp16, y = hidden_states_123)[name = string("hidden_states_125_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_25_begin_0 = const()[name = string("per_layer_slice_25_begin_0"), val = tensor<int32, [3]>([0, 0, 6144])];
+            tensor<int32, [3]> per_layer_slice_25_end_0 = const()[name = string("per_layer_slice_25_end_0"), val = tensor<int32, [3]>([1, 1, 6400])];
+            tensor<bool, [3]> per_layer_slice_25_end_mask_0 = const()[name = string("per_layer_slice_25_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_25_cast_fp16 = slice_by_index(begin = per_layer_slice_25_begin_0, end = per_layer_slice_25_end_0, end_mask = per_layer_slice_25_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_25_cast_fp16")];
+            tensor<int32, [3]> var_8119 = const()[name = string("op_8119"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_371_axes_0 = const()[name = string("input_371_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8120 = transpose(perm = var_8119, x = hidden_states_125_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1, 1]> input_371 = expand_dims(axes = input_371_axes_0, x = var_8120)[name = string("input_371")];
+            string gated_73_pad_type_0 = const()[name = string("gated_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_73_strides_0 = const()[name = string("gated_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_73_pad_0 = const()[name = string("gated_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_73_dilations_0 = const()[name = string("gated_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_73_groups_0 = const()[name = string("gated_73_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_73 = conv(dilations = gated_73_dilations_0, groups = gated_73_groups_0, pad = gated_73_pad_0, pad_type = gated_73_pad_type_0, strides = gated_73_strides_0, weight = layers_c3_0_per_layer_input_gate_weight_palettized, x = input_371)[name = string("gated_73")];
+            string gated_75_mode_0 = const()[name = string("gated_75_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_75 = gelu(mode = gated_75_mode_0, x = gated_73)[name = string("gated_75")];
+            tensor<int32, [3]> var_8139 = const()[name = string("op_8139"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_25_axes_0 = const()[name = string("per_layer_slice_conv_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_8140_cast_fp16 = transpose(perm = var_8139, x = per_layer_slice_25_cast_fp16)[name = string("transpose_81")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_25_cast_fp16 = expand_dims(axes = per_layer_slice_conv_25_axes_0, x = var_8140_cast_fp16)[name = string("per_layer_slice_conv_25_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_373_cast_fp16 = mul(x = gated_75, y = per_layer_slice_conv_25_cast_fp16)[name = string("input_373_cast_fp16")];
+            string gated_77_pad_type_0 = const()[name = string("gated_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_77_strides_0 = const()[name = string("gated_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_77_pad_0 = const()[name = string("gated_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_77_dilations_0 = const()[name = string("gated_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_77_groups_0 = const()[name = string("gated_77_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958177152))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958504896))))[name = string("layers_c3_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_77_cast_fp16 = conv(dilations = gated_77_dilations_0, groups = gated_77_groups_0, pad = gated_77_pad_0, pad_type = gated_77_pad_type_0, strides = gated_77_strides_0, weight = layers_c3_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_373_cast_fp16)[name = string("gated_77_cast_fp16")];
+            tensor<int32, [1]> var_8156_axes_0 = const()[name = string("op_8156_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8156_cast_fp16 = squeeze(axes = var_8156_axes_0, x = gated_77_cast_fp16)[name = string("op_8156_cast_fp16")];
+            tensor<int32, [3]> var_8160 = const()[name = string("op_8160"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8166 = const()[name = string("op_8166"), val = int32(-1)];
+            fp16 const_146_promoted_to_fp16 = const()[name = string("const_146_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_253_cast_fp16 = transpose(perm = var_8160, x = var_8156_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 1, 2560]> var_8168_cast_fp16 = mul(x = x_253_cast_fp16, y = const_146_promoted_to_fp16)[name = string("op_8168_cast_fp16")];
+            bool input_375_interleave_0 = const()[name = string("input_375_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_375_cast_fp16 = concat(axis = var_8166, interleave = input_375_interleave_0, values = (x_253_cast_fp16, var_8168_cast_fp16))[name = string("input_375_cast_fp16")];
+            tensor<int32, [1]> normed_357_axes_0 = const()[name = string("normed_357_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8163_to_fp16 = const()[name = string("op_8163_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_357_cast_fp16 = layer_norm(axes = normed_357_axes_0, epsilon = var_8163_to_fp16, x = input_375_cast_fp16)[name = string("normed_357_cast_fp16")];
+            tensor<int32, [2]> var_8173_split_sizes_0 = const()[name = string("op_8173_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8173_axis_0 = const()[name = string("op_8173_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8173_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8173_cast_fp16_1 = split(axis = var_8173_axis_0, split_sizes = var_8173_split_sizes_0, x = normed_357_cast_fp16)[name = string("op_8173_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958507520)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_129_cast_fp16 = mul(x = var_8173_cast_fp16_0, y = layers_c3_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_129_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_131_cast_fp16 = add(x = hidden_states_125_cast_fp16, y = hidden_states_129_cast_fp16)[name = string("hidden_states_131_cast_fp16")];
+            tensor<fp16, [1]> const_147_promoted_to_fp16 = const()[name = string("const_147_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.02p-1])];
+            tensor<fp16, [1, 1, 2560]> x_255_cast_fp16 = mul(x = hidden_states_131_cast_fp16, y = const_147_promoted_to_fp16)[name = string("x_255_cast_fp16")];
+            int32 var_8188 = const()[name = string("op_8188"), val = int32(-1)];
+            fp16 const_148_promoted_to_fp16 = const()[name = string("const_148_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_8190_cast_fp16 = mul(x = x_255_cast_fp16, y = const_148_promoted_to_fp16)[name = string("op_8190_cast_fp16")];
+            bool input_377_interleave_0 = const()[name = string("input_377_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_377_cast_fp16 = concat(axis = var_8188, interleave = input_377_interleave_0, values = (x_255_cast_fp16, var_8190_cast_fp16))[name = string("input_377_cast_fp16")];
+            tensor<int32, [1]> normed_361_axes_0 = const()[name = string("normed_361_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8185_to_fp16 = const()[name = string("op_8185_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_361_cast_fp16 = layer_norm(axes = normed_361_axes_0, epsilon = var_8185_to_fp16, x = input_377_cast_fp16)[name = string("normed_361_cast_fp16")];
+            tensor<int32, [2]> var_8195_split_sizes_0 = const()[name = string("op_8195_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8195_axis_0 = const()[name = string("op_8195_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8195_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8195_cast_fp16_1 = split(axis = var_8195_axis_0, split_sizes = var_8195_split_sizes_0, x = normed_361_cast_fp16)[name = string("op_8195_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958512704)))];
+            tensor<fp16, [1, 1, 2560]> h_79_cast_fp16 = mul(x = var_8195_cast_fp16_0, y = layers_c3_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_79_cast_fp16")];
+            tensor<int32, [3]> var_8201 = const()[name = string("op_8201"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_8204_axes_0 = const()[name = string("op_8204_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8202_cast_fp16 = transpose(perm = var_8201, x = h_79_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 2560, 1, 1]> var_8204_cast_fp16 = expand_dims(axes = var_8204_axes_0, x = var_8202_cast_fp16)[name = string("op_8204_cast_fp16")];
+            string var_8220_pad_type_0 = const()[name = string("op_8220_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8220_strides_0 = const()[name = string("op_8220_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8220_pad_0 = const()[name = string("op_8220_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8220_dilations_0 = const()[name = string("op_8220_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8220_groups_0 = const()[name = string("op_8220_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_8220 = conv(dilations = var_8220_dilations_0, groups = var_8220_groups_0, pad = var_8220_pad_0, pad_type = var_8220_pad_type_0, strides = var_8220_strides_0, weight = layers_c3_1_self_attn_q_proj_weight_palettized, x = var_8204_cast_fp16)[name = string("op_8220")];
+            tensor<int32, [4]> var_8225 = const()[name = string("op_8225"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_8226 = reshape(shape = var_8225, x = var_8220)[name = string("op_8226")];
+            tensor<int32, [4]> var_8231 = const()[name = string("op_8231"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_8241 = const()[name = string("op_8241"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_8232 = transpose(perm = var_8231, x = var_8226)[name = string("transpose_78")];
+            tensor<fp16, [1, 8, 256]> x_257 = reshape(shape = var_8241, x = var_8232)[name = string("x_257")];
+            int32 var_8247 = const()[name = string("op_8247"), val = int32(-1)];
+            fp16 const_149_promoted = const()[name = string("const_149_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_8249 = mul(x = x_257, y = const_149_promoted)[name = string("op_8249")];
+            bool input_381_interleave_0 = const()[name = string("input_381_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_381 = concat(axis = var_8247, interleave = input_381_interleave_0, values = (x_257, var_8249))[name = string("input_381")];
+            tensor<int32, [1]> normed_365_axes_0 = const()[name = string("normed_365_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8244_to_fp16 = const()[name = string("op_8244_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_365_cast_fp16 = layer_norm(axes = normed_365_axes_0, epsilon = var_8244_to_fp16, x = input_381)[name = string("normed_365_cast_fp16")];
+            tensor<int32, [2]> var_8254_split_sizes_0 = const()[name = string("op_8254_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_8254_axis_0 = const()[name = string("op_8254_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_8254_0, tensor<fp16, [1, 8, 256]> var_8254_1 = split(axis = var_8254_axis_0, split_sizes = var_8254_split_sizes_0, x = normed_365_cast_fp16)[name = string("op_8254")];
+            tensor<fp16, [1, 8, 256]> var_8256 = mul(x = var_8254_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_8256")];
+            tensor<int32, [4]> var_8261 = const()[name = string("op_8261"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_105 = reshape(shape = var_8261, x = var_8256)[name = string("q_105")];
+            tensor<fp16, [1, 8, 1, 256]> var_8263_cast_fp16 = mul(x = q_105, y = cos_s)[name = string("op_8263_cast_fp16")];
+            tensor<int32, [2]> var_8264_split_sizes_0 = const()[name = string("op_8264_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_8264_axis_0 = const()[name = string("op_8264_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_8264_0, tensor<fp16, [1, 8, 1, 128]> var_8264_1 = split(axis = var_8264_axis_0, split_sizes = var_8264_split_sizes_0, x = q_105)[name = string("op_8264")];
+            fp16 const_150_promoted = const()[name = string("const_150_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_8266 = mul(x = var_8264_1, y = const_150_promoted)[name = string("op_8266")];
+            int32 var_8268 = const()[name = string("op_8268"), val = int32(-1)];
+            bool var_8269_interleave_0 = const()[name = string("op_8269_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_8269 = concat(axis = var_8268, interleave = var_8269_interleave_0, values = (var_8266, var_8264_0))[name = string("op_8269")];
+            tensor<fp16, [1, 8, 1, 256]> var_8270_cast_fp16 = mul(x = var_8269, y = sin_s)[name = string("op_8270_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_107_cast_fp16 = add(x = var_8263_cast_fp16, y = var_8270_cast_fp16)[name = string("q_107_cast_fp16")];
+            bool attn_weights_53_transpose_x_0 = const()[name = string("attn_weights_53_transpose_x_0"), val = bool(false)];
+            bool attn_weights_53_transpose_y_0 = const()[name = string("attn_weights_53_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_53_cast_fp16 = matmul(transpose_x = attn_weights_53_transpose_x_0, transpose_y = attn_weights_53_transpose_y_0, x = q_107_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_53_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_259_cast_fp16 = add(x = attn_weights_53_cast_fp16, y = causal_mask_sliding)[name = string("x_259_cast_fp16")];
+            tensor<int32, [1]> reduce_max_13_axes_0 = const()[name = string("reduce_max_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_13_keep_dims_0 = const()[name = string("reduce_max_13_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_13 = reduce_max(axes = reduce_max_13_axes_0, keep_dims = reduce_max_13_keep_dims_0, x = x_259_cast_fp16)[name = string("reduce_max_13")];
+            tensor<fp16, [1, 8, 1, 512]> var_8302 = sub(x = x_259_cast_fp16, y = reduce_max_13)[name = string("op_8302")];
+            tensor<fp16, [1, 8, 1, 512]> var_8308 = exp(x = var_8302)[name = string("op_8308")];
+            tensor<int32, [1]> var_8318_axes_0 = const()[name = string("op_8318_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_8318_keep_dims_0 = const()[name = string("op_8318_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_8318 = reduce_sum(axes = var_8318_axes_0, keep_dims = var_8318_keep_dims_0, x = var_8308)[name = string("op_8318")];
+            tensor<fp16, [1, 8, 1, 512]> var_8324_cast_fp16 = real_div(x = var_8308, y = var_8318)[name = string("op_8324_cast_fp16")];
+            bool attn_output_79_transpose_x_0 = const()[name = string("attn_output_79_transpose_x_0"), val = bool(false)];
+            bool attn_output_79_transpose_y_0 = const()[name = string("attn_output_79_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_79_cast_fp16 = matmul(transpose_x = attn_output_79_transpose_x_0, transpose_y = attn_output_79_transpose_y_0, x = var_8324_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_79_cast_fp16")];
+            tensor<int32, [4]> var_8335 = const()[name = string("op_8335"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_8342 = const()[name = string("op_8342"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_8336_cast_fp16 = transpose(perm = var_8335, x = attn_output_79_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 1, 2048]> attn_output_81_cast_fp16 = reshape(shape = var_8342, x = var_8336_cast_fp16)[name = string("attn_output_81_cast_fp16")];
+            tensor<int32, [3]> var_8347 = const()[name = string("op_8347"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_8363_pad_type_0 = const()[name = string("op_8363_pad_type_0"), val = string("valid")];
+            int32 var_8363_groups_0 = const()[name = string("op_8363_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_8363_strides_0 = const()[name = string("op_8363_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_8363_pad_0 = const()[name = string("op_8363_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_8363_dilations_0 = const()[name = string("op_8363_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_13_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(958517888))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961139392))))[name = string("squeeze_13_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_8348_cast_fp16 = transpose(perm = var_8347, x = attn_output_81_cast_fp16)[name = string("transpose_76")];
+            tensor<fp16, [1, 2560, 1]> var_8363_cast_fp16 = conv(dilations = var_8363_dilations_0, groups = var_8363_groups_0, pad = var_8363_pad_0, pad_type = var_8363_pad_type_0, strides = var_8363_strides_0, weight = squeeze_13_cast_fp16_to_fp32_to_fp16_palettized, x = var_8348_cast_fp16)[name = string("op_8363_cast_fp16")];
+            tensor<int32, [3]> var_8367 = const()[name = string("op_8367"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8373 = const()[name = string("op_8373"), val = int32(-1)];
+            fp16 const_151_promoted_to_fp16 = const()[name = string("const_151_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_263_cast_fp16 = transpose(perm = var_8367, x = var_8363_cast_fp16)[name = string("transpose_75")];
+            tensor<fp16, [1, 1, 2560]> var_8375_cast_fp16 = mul(x = x_263_cast_fp16, y = const_151_promoted_to_fp16)[name = string("op_8375_cast_fp16")];
+            bool input_385_interleave_0 = const()[name = string("input_385_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_385_cast_fp16 = concat(axis = var_8373, interleave = input_385_interleave_0, values = (x_263_cast_fp16, var_8375_cast_fp16))[name = string("input_385_cast_fp16")];
+            tensor<int32, [1]> normed_369_axes_0 = const()[name = string("normed_369_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8370_to_fp16 = const()[name = string("op_8370_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_369_cast_fp16 = layer_norm(axes = normed_369_axes_0, epsilon = var_8370_to_fp16, x = input_385_cast_fp16)[name = string("normed_369_cast_fp16")];
+            tensor<int32, [2]> var_8380_split_sizes_0 = const()[name = string("op_8380_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8380_axis_0 = const()[name = string("op_8380_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8380_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8380_cast_fp16_1 = split(axis = var_8380_axis_0, split_sizes = var_8380_split_sizes_0, x = normed_369_cast_fp16)[name = string("op_8380_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961142016)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_83_cast_fp16 = mul(x = var_8380_cast_fp16_0, y = layers_c3_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_83_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_265_cast_fp16 = add(x = x_255_cast_fp16, y = attn_output_83_cast_fp16)[name = string("x_265_cast_fp16")];
+            int32 var_8389 = const()[name = string("op_8389"), val = int32(-1)];
+            fp16 const_152_promoted_to_fp16 = const()[name = string("const_152_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_8391_cast_fp16 = mul(x = x_265_cast_fp16, y = const_152_promoted_to_fp16)[name = string("op_8391_cast_fp16")];
+            bool input_387_interleave_0 = const()[name = string("input_387_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_387_cast_fp16 = concat(axis = var_8389, interleave = input_387_interleave_0, values = (x_265_cast_fp16, var_8391_cast_fp16))[name = string("input_387_cast_fp16")];
+            tensor<int32, [1]> normed_373_axes_0 = const()[name = string("normed_373_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8386_to_fp16 = const()[name = string("op_8386_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_373_cast_fp16 = layer_norm(axes = normed_373_axes_0, epsilon = var_8386_to_fp16, x = input_387_cast_fp16)[name = string("normed_373_cast_fp16")];
+            tensor<int32, [2]> var_8396_split_sizes_0 = const()[name = string("op_8396_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8396_axis_0 = const()[name = string("op_8396_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8396_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8396_cast_fp16_1 = split(axis = var_8396_axis_0, split_sizes = var_8396_split_sizes_0, x = normed_373_cast_fp16)[name = string("op_8396_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961147200)))];
+            tensor<fp16, [1, 1, 2560]> h_81_cast_fp16 = mul(x = var_8396_cast_fp16_0, y = layers_c3_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_81_cast_fp16")];
+            tensor<int32, [3]> var_8407 = const()[name = string("op_8407"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_389_axes_0 = const()[name = string("input_389_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8408 = transpose(perm = var_8407, x = h_81_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [1, 2560, 1, 1]> input_389 = expand_dims(axes = input_389_axes_0, x = var_8408)[name = string("input_389")];
+            string gate_53_pad_type_0 = const()[name = string("gate_53_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_53_strides_0 = const()[name = string("gate_53_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_53_pad_0 = const()[name = string("gate_53_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_53_dilations_0 = const()[name = string("gate_53_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_53_groups_0 = const()[name = string("gate_53_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_53 = conv(dilations = gate_53_dilations_0, groups = gate_53_groups_0, pad = gate_53_pad_0, pad_type = gate_53_pad_type_0, strides = gate_53_strides_0, weight = layers_c3_1_mlp_gate_proj_weight_palettized, x = input_389)[name = string("gate_53")];
+            string up_27_pad_type_0 = const()[name = string("up_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_27_strides_0 = const()[name = string("up_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_27_pad_0 = const()[name = string("up_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_27_dilations_0 = const()[name = string("up_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_27_groups_0 = const()[name = string("up_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_27 = conv(dilations = up_27_dilations_0, groups = up_27_groups_0, pad = up_27_pad_0, pad_type = up_27_pad_type_0, strides = up_27_strides_0, weight = layers_c3_1_mlp_up_proj_weight_palettized, x = input_389)[name = string("up_27")];
+            string gate_55_mode_0 = const()[name = string("gate_55_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_55 = gelu(mode = gate_55_mode_0, x = gate_53)[name = string("gate_55")];
+            tensor<fp16, [1, 10240, 1, 1]> input_391 = mul(x = gate_55, y = up_27)[name = string("input_391")];
+            string mlp_out_27_pad_type_0 = const()[name = string("mlp_out_27_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_27_strides_0 = const()[name = string("mlp_out_27_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_27_pad_0 = const()[name = string("mlp_out_27_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_27_dilations_0 = const()[name = string("mlp_out_27_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_27_groups_0 = const()[name = string("mlp_out_27_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_27 = conv(dilations = mlp_out_27_dilations_0, groups = mlp_out_27_groups_0, pad = mlp_out_27_pad_0, pad_type = mlp_out_27_pad_type_0, strides = mlp_out_27_strides_0, weight = layers_c3_1_mlp_down_proj_weight_palettized, x = input_391)[name = string("mlp_out_27")];
+            tensor<int32, [1]> var_8448_axes_0 = const()[name = string("op_8448_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8448 = squeeze(axes = var_8448_axes_0, x = mlp_out_27)[name = string("op_8448")];
+            tensor<int32, [3]> var_8452 = const()[name = string("op_8452"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8458 = const()[name = string("op_8458"), val = int32(-1)];
+            fp16 const_153_promoted = const()[name = string("const_153_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_267 = transpose(perm = var_8452, x = var_8448)[name = string("transpose_73")];
+            tensor<fp16, [1, 1, 2560]> var_8460 = mul(x = x_267, y = const_153_promoted)[name = string("op_8460")];
+            bool input_393_interleave_0 = const()[name = string("input_393_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_393 = concat(axis = var_8458, interleave = input_393_interleave_0, values = (x_267, var_8460))[name = string("input_393")];
+            tensor<int32, [1]> normed_377_axes_0 = const()[name = string("normed_377_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8455_to_fp16 = const()[name = string("op_8455_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_377_cast_fp16 = layer_norm(axes = normed_377_axes_0, epsilon = var_8455_to_fp16, x = input_393)[name = string("normed_377_cast_fp16")];
+            tensor<int32, [2]> var_8465_split_sizes_0 = const()[name = string("op_8465_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8465_axis_0 = const()[name = string("op_8465_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8465_0, tensor<fp16, [1, 1, 2560]> var_8465_1 = split(axis = var_8465_axis_0, split_sizes = var_8465_split_sizes_0, x = normed_377_cast_fp16)[name = string("op_8465")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_133 = mul(x = var_8465_0, y = layers_c3_1_post_feedforward_layernorm_weight)[name = string("hidden_states_133")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_135_cast_fp16 = add(x = x_265_cast_fp16, y = hidden_states_133)[name = string("hidden_states_135_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_27_begin_0 = const()[name = string("per_layer_slice_27_begin_0"), val = tensor<int32, [3]>([0, 0, 6400])];
+            tensor<int32, [3]> per_layer_slice_27_end_0 = const()[name = string("per_layer_slice_27_end_0"), val = tensor<int32, [3]>([1, 1, 6656])];
+            tensor<bool, [3]> per_layer_slice_27_end_mask_0 = const()[name = string("per_layer_slice_27_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_27_cast_fp16 = slice_by_index(begin = per_layer_slice_27_begin_0, end = per_layer_slice_27_end_0, end_mask = per_layer_slice_27_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_27_cast_fp16")];
+            tensor<int32, [3]> var_8493 = const()[name = string("op_8493"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_395_axes_0 = const()[name = string("input_395_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8494 = transpose(perm = var_8493, x = hidden_states_135_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 2560, 1, 1]> input_395 = expand_dims(axes = input_395_axes_0, x = var_8494)[name = string("input_395")];
+            string gated_79_pad_type_0 = const()[name = string("gated_79_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_79_strides_0 = const()[name = string("gated_79_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_79_pad_0 = const()[name = string("gated_79_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_79_dilations_0 = const()[name = string("gated_79_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_79_groups_0 = const()[name = string("gated_79_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_79 = conv(dilations = gated_79_dilations_0, groups = gated_79_groups_0, pad = gated_79_pad_0, pad_type = gated_79_pad_type_0, strides = gated_79_strides_0, weight = layers_c3_1_per_layer_input_gate_weight_palettized, x = input_395)[name = string("gated_79")];
+            string gated_81_mode_0 = const()[name = string("gated_81_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_81 = gelu(mode = gated_81_mode_0, x = gated_79)[name = string("gated_81")];
+            tensor<int32, [3]> var_8513 = const()[name = string("op_8513"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_27_axes_0 = const()[name = string("per_layer_slice_conv_27_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_8514_cast_fp16 = transpose(perm = var_8513, x = per_layer_slice_27_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_27_cast_fp16 = expand_dims(axes = per_layer_slice_conv_27_axes_0, x = var_8514_cast_fp16)[name = string("per_layer_slice_conv_27_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_397_cast_fp16 = mul(x = gated_81, y = per_layer_slice_conv_27_cast_fp16)[name = string("input_397_cast_fp16")];
+            string gated_83_pad_type_0 = const()[name = string("gated_83_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_83_strides_0 = const()[name = string("gated_83_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_83_pad_0 = const()[name = string("gated_83_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_83_dilations_0 = const()[name = string("gated_83_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_83_groups_0 = const()[name = string("gated_83_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961152384))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961480128))))[name = string("layers_c3_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_83_cast_fp16 = conv(dilations = gated_83_dilations_0, groups = gated_83_groups_0, pad = gated_83_pad_0, pad_type = gated_83_pad_type_0, strides = gated_83_strides_0, weight = layers_c3_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_397_cast_fp16)[name = string("gated_83_cast_fp16")];
+            tensor<int32, [1]> var_8530_axes_0 = const()[name = string("op_8530_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8530_cast_fp16 = squeeze(axes = var_8530_axes_0, x = gated_83_cast_fp16)[name = string("op_8530_cast_fp16")];
+            tensor<int32, [3]> var_8534 = const()[name = string("op_8534"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8540 = const()[name = string("op_8540"), val = int32(-1)];
+            fp16 const_154_promoted_to_fp16 = const()[name = string("const_154_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_269_cast_fp16 = transpose(perm = var_8534, x = var_8530_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [1, 1, 2560]> var_8542_cast_fp16 = mul(x = x_269_cast_fp16, y = const_154_promoted_to_fp16)[name = string("op_8542_cast_fp16")];
+            bool input_399_interleave_0 = const()[name = string("input_399_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_399_cast_fp16 = concat(axis = var_8540, interleave = input_399_interleave_0, values = (x_269_cast_fp16, var_8542_cast_fp16))[name = string("input_399_cast_fp16")];
+            tensor<int32, [1]> normed_381_axes_0 = const()[name = string("normed_381_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8537_to_fp16 = const()[name = string("op_8537_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_381_cast_fp16 = layer_norm(axes = normed_381_axes_0, epsilon = var_8537_to_fp16, x = input_399_cast_fp16)[name = string("normed_381_cast_fp16")];
+            tensor<int32, [2]> var_8547_split_sizes_0 = const()[name = string("op_8547_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8547_axis_0 = const()[name = string("op_8547_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8547_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8547_cast_fp16_1 = split(axis = var_8547_axis_0, split_sizes = var_8547_split_sizes_0, x = normed_381_cast_fp16)[name = string("op_8547_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961482752)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_139_cast_fp16 = mul(x = var_8547_cast_fp16_0, y = layers_c3_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_139_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_141_cast_fp16 = add(x = hidden_states_135_cast_fp16, y = hidden_states_139_cast_fp16)[name = string("hidden_states_141_cast_fp16")];
+            tensor<fp16, [1]> const_155_promoted_to_fp16 = const()[name = string("const_155_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_271_cast_fp16 = mul(x = hidden_states_141_cast_fp16, y = const_155_promoted_to_fp16)[name = string("x_271_cast_fp16")];
+            int32 var_8562 = const()[name = string("op_8562"), val = int32(-1)];
+            fp16 const_156_promoted_to_fp16 = const()[name = string("const_156_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_8564_cast_fp16 = mul(x = x_271_cast_fp16, y = const_156_promoted_to_fp16)[name = string("op_8564_cast_fp16")];
+            bool input_401_interleave_0 = const()[name = string("input_401_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_401_cast_fp16 = concat(axis = var_8562, interleave = input_401_interleave_0, values = (x_271_cast_fp16, var_8564_cast_fp16))[name = string("input_401_cast_fp16")];
+            tensor<int32, [1]> normed_385_axes_0 = const()[name = string("normed_385_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8559_to_fp16 = const()[name = string("op_8559_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_385_cast_fp16 = layer_norm(axes = normed_385_axes_0, epsilon = var_8559_to_fp16, x = input_401_cast_fp16)[name = string("normed_385_cast_fp16")];
+            tensor<int32, [2]> var_8569_split_sizes_0 = const()[name = string("op_8569_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8569_axis_0 = const()[name = string("op_8569_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8569_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8569_cast_fp16_1 = split(axis = var_8569_axis_0, split_sizes = var_8569_split_sizes_0, x = normed_385_cast_fp16)[name = string("op_8569_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961487936)))];
+            tensor<fp16, [1, 1, 2560]> h_85_cast_fp16 = mul(x = var_8569_cast_fp16_0, y = layers_c3_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_85_cast_fp16")];
+            tensor<int32, [3]> var_8575 = const()[name = string("op_8575"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_8578_axes_0 = const()[name = string("op_8578_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8576_cast_fp16 = transpose(perm = var_8575, x = h_85_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 2560, 1, 1]> var_8578_cast_fp16 = expand_dims(axes = var_8578_axes_0, x = var_8576_cast_fp16)[name = string("op_8578_cast_fp16")];
+            string var_8594_pad_type_0 = const()[name = string("op_8594_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8594_strides_0 = const()[name = string("op_8594_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8594_pad_0 = const()[name = string("op_8594_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8594_dilations_0 = const()[name = string("op_8594_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8594_groups_0 = const()[name = string("op_8594_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_8594 = conv(dilations = var_8594_dilations_0, groups = var_8594_groups_0, pad = var_8594_pad_0, pad_type = var_8594_pad_type_0, strides = var_8594_strides_0, weight = layers_c3_2_self_attn_q_proj_weight_palettized, x = var_8578_cast_fp16)[name = string("op_8594")];
+            tensor<int32, [4]> var_8599 = const()[name = string("op_8599"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_8600 = reshape(shape = var_8599, x = var_8594)[name = string("op_8600")];
+            tensor<int32, [4]> var_8605 = const()[name = string("op_8605"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_8615 = const()[name = string("op_8615"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_8606 = transpose(perm = var_8605, x = var_8600)[name = string("transpose_68")];
+            tensor<fp16, [1, 8, 256]> x_273 = reshape(shape = var_8615, x = var_8606)[name = string("x_273")];
+            int32 var_8621 = const()[name = string("op_8621"), val = int32(-1)];
+            fp16 const_157_promoted = const()[name = string("const_157_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_8623 = mul(x = x_273, y = const_157_promoted)[name = string("op_8623")];
+            bool input_405_interleave_0 = const()[name = string("input_405_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_405 = concat(axis = var_8621, interleave = input_405_interleave_0, values = (x_273, var_8623))[name = string("input_405")];
+            tensor<int32, [1]> normed_389_axes_0 = const()[name = string("normed_389_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8618_to_fp16 = const()[name = string("op_8618_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_389_cast_fp16 = layer_norm(axes = normed_389_axes_0, epsilon = var_8618_to_fp16, x = input_405)[name = string("normed_389_cast_fp16")];
+            tensor<int32, [2]> var_8628_split_sizes_0 = const()[name = string("op_8628_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_8628_axis_0 = const()[name = string("op_8628_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_8628_0, tensor<fp16, [1, 8, 256]> var_8628_1 = split(axis = var_8628_axis_0, split_sizes = var_8628_split_sizes_0, x = normed_389_cast_fp16)[name = string("op_8628")];
+            tensor<fp16, [1, 8, 256]> var_8630 = mul(x = var_8628_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_8630")];
+            tensor<int32, [4]> var_8635 = const()[name = string("op_8635"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_111 = reshape(shape = var_8635, x = var_8630)[name = string("q_111")];
+            tensor<fp16, [1, 8, 1, 256]> var_8637_cast_fp16 = mul(x = q_111, y = cos_s)[name = string("op_8637_cast_fp16")];
+            tensor<int32, [2]> var_8638_split_sizes_0 = const()[name = string("op_8638_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_8638_axis_0 = const()[name = string("op_8638_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_8638_0, tensor<fp16, [1, 8, 1, 128]> var_8638_1 = split(axis = var_8638_axis_0, split_sizes = var_8638_split_sizes_0, x = q_111)[name = string("op_8638")];
+            fp16 const_158_promoted = const()[name = string("const_158_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_8640 = mul(x = var_8638_1, y = const_158_promoted)[name = string("op_8640")];
+            int32 var_8642 = const()[name = string("op_8642"), val = int32(-1)];
+            bool var_8643_interleave_0 = const()[name = string("op_8643_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_8643 = concat(axis = var_8642, interleave = var_8643_interleave_0, values = (var_8640, var_8638_0))[name = string("op_8643")];
+            tensor<fp16, [1, 8, 1, 256]> var_8644_cast_fp16 = mul(x = var_8643, y = sin_s)[name = string("op_8644_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_113_cast_fp16 = add(x = var_8637_cast_fp16, y = var_8644_cast_fp16)[name = string("q_113_cast_fp16")];
+            bool attn_weights_57_transpose_x_0 = const()[name = string("attn_weights_57_transpose_x_0"), val = bool(false)];
+            bool attn_weights_57_transpose_y_0 = const()[name = string("attn_weights_57_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_57_cast_fp16 = matmul(transpose_x = attn_weights_57_transpose_x_0, transpose_y = attn_weights_57_transpose_y_0, x = q_113_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_57_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_275_cast_fp16 = add(x = attn_weights_57_cast_fp16, y = causal_mask_sliding)[name = string("x_275_cast_fp16")];
+            tensor<int32, [1]> reduce_max_14_axes_0 = const()[name = string("reduce_max_14_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_14_keep_dims_0 = const()[name = string("reduce_max_14_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_14 = reduce_max(axes = reduce_max_14_axes_0, keep_dims = reduce_max_14_keep_dims_0, x = x_275_cast_fp16)[name = string("reduce_max_14")];
+            tensor<fp16, [1, 8, 1, 512]> var_8676 = sub(x = x_275_cast_fp16, y = reduce_max_14)[name = string("op_8676")];
+            tensor<fp16, [1, 8, 1, 512]> var_8682 = exp(x = var_8676)[name = string("op_8682")];
+            tensor<int32, [1]> var_8692_axes_0 = const()[name = string("op_8692_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_8692_keep_dims_0 = const()[name = string("op_8692_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_8692 = reduce_sum(axes = var_8692_axes_0, keep_dims = var_8692_keep_dims_0, x = var_8682)[name = string("op_8692")];
+            tensor<fp16, [1, 8, 1, 512]> var_8698_cast_fp16 = real_div(x = var_8682, y = var_8692)[name = string("op_8698_cast_fp16")];
+            bool attn_output_85_transpose_x_0 = const()[name = string("attn_output_85_transpose_x_0"), val = bool(false)];
+            bool attn_output_85_transpose_y_0 = const()[name = string("attn_output_85_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_85_cast_fp16 = matmul(transpose_x = attn_output_85_transpose_x_0, transpose_y = attn_output_85_transpose_y_0, x = var_8698_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_85_cast_fp16")];
+            tensor<int32, [4]> var_8709 = const()[name = string("op_8709"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_8716 = const()[name = string("op_8716"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_8710_cast_fp16 = transpose(perm = var_8709, x = attn_output_85_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 1, 2048]> attn_output_87_cast_fp16 = reshape(shape = var_8716, x = var_8710_cast_fp16)[name = string("attn_output_87_cast_fp16")];
+            tensor<int32, [3]> var_8721 = const()[name = string("op_8721"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_8737_pad_type_0 = const()[name = string("op_8737_pad_type_0"), val = string("valid")];
+            int32 var_8737_groups_0 = const()[name = string("op_8737_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_8737_strides_0 = const()[name = string("op_8737_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_8737_pad_0 = const()[name = string("op_8737_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_8737_dilations_0 = const()[name = string("op_8737_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_14_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(961493120))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964114624))))[name = string("squeeze_14_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_8722_cast_fp16 = transpose(perm = var_8721, x = attn_output_87_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 2560, 1]> var_8737_cast_fp16 = conv(dilations = var_8737_dilations_0, groups = var_8737_groups_0, pad = var_8737_pad_0, pad_type = var_8737_pad_type_0, strides = var_8737_strides_0, weight = squeeze_14_cast_fp16_to_fp32_to_fp16_palettized, x = var_8722_cast_fp16)[name = string("op_8737_cast_fp16")];
+            tensor<int32, [3]> var_8741 = const()[name = string("op_8741"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8747 = const()[name = string("op_8747"), val = int32(-1)];
+            fp16 const_159_promoted_to_fp16 = const()[name = string("const_159_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_279_cast_fp16 = transpose(perm = var_8741, x = var_8737_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [1, 1, 2560]> var_8749_cast_fp16 = mul(x = x_279_cast_fp16, y = const_159_promoted_to_fp16)[name = string("op_8749_cast_fp16")];
+            bool input_409_interleave_0 = const()[name = string("input_409_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_409_cast_fp16 = concat(axis = var_8747, interleave = input_409_interleave_0, values = (x_279_cast_fp16, var_8749_cast_fp16))[name = string("input_409_cast_fp16")];
+            tensor<int32, [1]> normed_393_axes_0 = const()[name = string("normed_393_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8744_to_fp16 = const()[name = string("op_8744_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_393_cast_fp16 = layer_norm(axes = normed_393_axes_0, epsilon = var_8744_to_fp16, x = input_409_cast_fp16)[name = string("normed_393_cast_fp16")];
+            tensor<int32, [2]> var_8754_split_sizes_0 = const()[name = string("op_8754_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8754_axis_0 = const()[name = string("op_8754_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8754_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8754_cast_fp16_1 = split(axis = var_8754_axis_0, split_sizes = var_8754_split_sizes_0, x = normed_393_cast_fp16)[name = string("op_8754_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964117248)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_89_cast_fp16 = mul(x = var_8754_cast_fp16_0, y = layers_c3_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_281_cast_fp16 = add(x = x_271_cast_fp16, y = attn_output_89_cast_fp16)[name = string("x_281_cast_fp16")];
+            int32 var_8763 = const()[name = string("op_8763"), val = int32(-1)];
+            fp16 const_160_promoted_to_fp16 = const()[name = string("const_160_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_8765_cast_fp16 = mul(x = x_281_cast_fp16, y = const_160_promoted_to_fp16)[name = string("op_8765_cast_fp16")];
+            bool input_411_interleave_0 = const()[name = string("input_411_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_411_cast_fp16 = concat(axis = var_8763, interleave = input_411_interleave_0, values = (x_281_cast_fp16, var_8765_cast_fp16))[name = string("input_411_cast_fp16")];
+            tensor<int32, [1]> normed_397_axes_0 = const()[name = string("normed_397_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8760_to_fp16 = const()[name = string("op_8760_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_397_cast_fp16 = layer_norm(axes = normed_397_axes_0, epsilon = var_8760_to_fp16, x = input_411_cast_fp16)[name = string("normed_397_cast_fp16")];
+            tensor<int32, [2]> var_8770_split_sizes_0 = const()[name = string("op_8770_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8770_axis_0 = const()[name = string("op_8770_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8770_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8770_cast_fp16_1 = split(axis = var_8770_axis_0, split_sizes = var_8770_split_sizes_0, x = normed_397_cast_fp16)[name = string("op_8770_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964122432)))];
+            tensor<fp16, [1, 1, 2560]> h_87_cast_fp16 = mul(x = var_8770_cast_fp16_0, y = layers_c3_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_87_cast_fp16")];
+            tensor<int32, [3]> var_8781 = const()[name = string("op_8781"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_413_axes_0 = const()[name = string("input_413_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8782 = transpose(perm = var_8781, x = h_87_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 2560, 1, 1]> input_413 = expand_dims(axes = input_413_axes_0, x = var_8782)[name = string("input_413")];
+            string gate_57_pad_type_0 = const()[name = string("gate_57_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_57_strides_0 = const()[name = string("gate_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_57_pad_0 = const()[name = string("gate_57_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_57_dilations_0 = const()[name = string("gate_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_57_groups_0 = const()[name = string("gate_57_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_57 = conv(dilations = gate_57_dilations_0, groups = gate_57_groups_0, pad = gate_57_pad_0, pad_type = gate_57_pad_type_0, strides = gate_57_strides_0, weight = layers_c3_2_mlp_gate_proj_weight_palettized, x = input_413)[name = string("gate_57")];
+            string up_29_pad_type_0 = const()[name = string("up_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_29_strides_0 = const()[name = string("up_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_29_pad_0 = const()[name = string("up_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_29_dilations_0 = const()[name = string("up_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_29_groups_0 = const()[name = string("up_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_29 = conv(dilations = up_29_dilations_0, groups = up_29_groups_0, pad = up_29_pad_0, pad_type = up_29_pad_type_0, strides = up_29_strides_0, weight = layers_c3_2_mlp_up_proj_weight_palettized, x = input_413)[name = string("up_29")];
+            string gate_59_mode_0 = const()[name = string("gate_59_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_59 = gelu(mode = gate_59_mode_0, x = gate_57)[name = string("gate_59")];
+            tensor<fp16, [1, 10240, 1, 1]> input_415 = mul(x = gate_59, y = up_29)[name = string("input_415")];
+            string mlp_out_29_pad_type_0 = const()[name = string("mlp_out_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_29_strides_0 = const()[name = string("mlp_out_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_29_pad_0 = const()[name = string("mlp_out_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_29_dilations_0 = const()[name = string("mlp_out_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_29_groups_0 = const()[name = string("mlp_out_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_29 = conv(dilations = mlp_out_29_dilations_0, groups = mlp_out_29_groups_0, pad = mlp_out_29_pad_0, pad_type = mlp_out_29_pad_type_0, strides = mlp_out_29_strides_0, weight = layers_c3_2_mlp_down_proj_weight_palettized, x = input_415)[name = string("mlp_out_29")];
+            tensor<int32, [1]> var_8822_axes_0 = const()[name = string("op_8822_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8822 = squeeze(axes = var_8822_axes_0, x = mlp_out_29)[name = string("op_8822")];
+            tensor<int32, [3]> var_8826 = const()[name = string("op_8826"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8832 = const()[name = string("op_8832"), val = int32(-1)];
+            fp16 const_161_promoted = const()[name = string("const_161_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_283 = transpose(perm = var_8826, x = var_8822)[name = string("transpose_63")];
+            tensor<fp16, [1, 1, 2560]> var_8834 = mul(x = x_283, y = const_161_promoted)[name = string("op_8834")];
+            bool input_417_interleave_0 = const()[name = string("input_417_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_417 = concat(axis = var_8832, interleave = input_417_interleave_0, values = (x_283, var_8834))[name = string("input_417")];
+            tensor<int32, [1]> normed_401_axes_0 = const()[name = string("normed_401_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8829_to_fp16 = const()[name = string("op_8829_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_401_cast_fp16 = layer_norm(axes = normed_401_axes_0, epsilon = var_8829_to_fp16, x = input_417)[name = string("normed_401_cast_fp16")];
+            tensor<int32, [2]> var_8839_split_sizes_0 = const()[name = string("op_8839_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8839_axis_0 = const()[name = string("op_8839_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8839_0, tensor<fp16, [1, 1, 2560]> var_8839_1 = split(axis = var_8839_axis_0, split_sizes = var_8839_split_sizes_0, x = normed_401_cast_fp16)[name = string("op_8839")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_143 = mul(x = var_8839_0, y = layers_c3_2_post_feedforward_layernorm_weight)[name = string("hidden_states_143")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_145_cast_fp16 = add(x = x_281_cast_fp16, y = hidden_states_143)[name = string("hidden_states_145_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_29_begin_0 = const()[name = string("per_layer_slice_29_begin_0"), val = tensor<int32, [3]>([0, 0, 6656])];
+            tensor<int32, [3]> per_layer_slice_29_end_0 = const()[name = string("per_layer_slice_29_end_0"), val = tensor<int32, [3]>([1, 1, 6912])];
+            tensor<bool, [3]> per_layer_slice_29_end_mask_0 = const()[name = string("per_layer_slice_29_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_29_cast_fp16 = slice_by_index(begin = per_layer_slice_29_begin_0, end = per_layer_slice_29_end_0, end_mask = per_layer_slice_29_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_29_cast_fp16")];
+            tensor<int32, [3]> var_8867 = const()[name = string("op_8867"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_419_axes_0 = const()[name = string("input_419_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8868 = transpose(perm = var_8867, x = hidden_states_145_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 2560, 1, 1]> input_419 = expand_dims(axes = input_419_axes_0, x = var_8868)[name = string("input_419")];
+            string gated_85_pad_type_0 = const()[name = string("gated_85_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_85_strides_0 = const()[name = string("gated_85_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_85_pad_0 = const()[name = string("gated_85_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_85_dilations_0 = const()[name = string("gated_85_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_85_groups_0 = const()[name = string("gated_85_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_85 = conv(dilations = gated_85_dilations_0, groups = gated_85_groups_0, pad = gated_85_pad_0, pad_type = gated_85_pad_type_0, strides = gated_85_strides_0, weight = layers_c3_2_per_layer_input_gate_weight_palettized, x = input_419)[name = string("gated_85")];
+            string gated_87_mode_0 = const()[name = string("gated_87_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_87 = gelu(mode = gated_87_mode_0, x = gated_85)[name = string("gated_87")];
+            tensor<int32, [3]> var_8887 = const()[name = string("op_8887"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_29_axes_0 = const()[name = string("per_layer_slice_conv_29_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_8888_cast_fp16 = transpose(perm = var_8887, x = per_layer_slice_29_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_29_cast_fp16 = expand_dims(axes = per_layer_slice_conv_29_axes_0, x = var_8888_cast_fp16)[name = string("per_layer_slice_conv_29_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_421_cast_fp16 = mul(x = gated_87, y = per_layer_slice_conv_29_cast_fp16)[name = string("input_421_cast_fp16")];
+            string gated_89_pad_type_0 = const()[name = string("gated_89_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_89_strides_0 = const()[name = string("gated_89_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_89_pad_0 = const()[name = string("gated_89_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_89_dilations_0 = const()[name = string("gated_89_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_89_groups_0 = const()[name = string("gated_89_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964127616))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964455360))))[name = string("layers_c3_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_89_cast_fp16 = conv(dilations = gated_89_dilations_0, groups = gated_89_groups_0, pad = gated_89_pad_0, pad_type = gated_89_pad_type_0, strides = gated_89_strides_0, weight = layers_c3_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_421_cast_fp16)[name = string("gated_89_cast_fp16")];
+            tensor<int32, [1]> var_8904_axes_0 = const()[name = string("op_8904_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8904_cast_fp16 = squeeze(axes = var_8904_axes_0, x = gated_89_cast_fp16)[name = string("op_8904_cast_fp16")];
+            tensor<int32, [3]> var_8908 = const()[name = string("op_8908"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_8914 = const()[name = string("op_8914"), val = int32(-1)];
+            fp16 const_162_promoted_to_fp16 = const()[name = string("const_162_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_285_cast_fp16 = transpose(perm = var_8908, x = var_8904_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 1, 2560]> var_8916_cast_fp16 = mul(x = x_285_cast_fp16, y = const_162_promoted_to_fp16)[name = string("op_8916_cast_fp16")];
+            bool input_423_interleave_0 = const()[name = string("input_423_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_423_cast_fp16 = concat(axis = var_8914, interleave = input_423_interleave_0, values = (x_285_cast_fp16, var_8916_cast_fp16))[name = string("input_423_cast_fp16")];
+            tensor<int32, [1]> normed_405_axes_0 = const()[name = string("normed_405_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8911_to_fp16 = const()[name = string("op_8911_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_405_cast_fp16 = layer_norm(axes = normed_405_axes_0, epsilon = var_8911_to_fp16, x = input_423_cast_fp16)[name = string("normed_405_cast_fp16")];
+            tensor<int32, [2]> var_8921_split_sizes_0 = const()[name = string("op_8921_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8921_axis_0 = const()[name = string("op_8921_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8921_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8921_cast_fp16_1 = split(axis = var_8921_axis_0, split_sizes = var_8921_split_sizes_0, x = normed_405_cast_fp16)[name = string("op_8921_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964457984)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_149_cast_fp16 = mul(x = var_8921_cast_fp16_0, y = layers_c3_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_149_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_151_cast_fp16 = add(x = hidden_states_145_cast_fp16, y = hidden_states_149_cast_fp16)[name = string("hidden_states_151_cast_fp16")];
+            tensor<fp16, [1]> const_163_promoted_to_fp16 = const()[name = string("const_163_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_287_cast_fp16 = mul(x = hidden_states_151_cast_fp16, y = const_163_promoted_to_fp16)[name = string("x_287_cast_fp16")];
+            int32 var_8936 = const()[name = string("op_8936"), val = int32(-1)];
+            fp16 const_164_promoted_to_fp16 = const()[name = string("const_164_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_8938_cast_fp16 = mul(x = x_287_cast_fp16, y = const_164_promoted_to_fp16)[name = string("op_8938_cast_fp16")];
+            bool input_425_interleave_0 = const()[name = string("input_425_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_425_cast_fp16 = concat(axis = var_8936, interleave = input_425_interleave_0, values = (x_287_cast_fp16, var_8938_cast_fp16))[name = string("input_425_cast_fp16")];
+            tensor<int32, [1]> normed_409_axes_0 = const()[name = string("normed_409_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8933_to_fp16 = const()[name = string("op_8933_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_409_cast_fp16 = layer_norm(axes = normed_409_axes_0, epsilon = var_8933_to_fp16, x = input_425_cast_fp16)[name = string("normed_409_cast_fp16")];
+            tensor<int32, [2]> var_8943_split_sizes_0 = const()[name = string("op_8943_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_8943_axis_0 = const()[name = string("op_8943_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_8943_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_8943_cast_fp16_1 = split(axis = var_8943_axis_0, split_sizes = var_8943_split_sizes_0, x = normed_409_cast_fp16)[name = string("op_8943_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964463168)))];
+            tensor<fp16, [1, 1, 2560]> h_91_cast_fp16 = mul(x = var_8943_cast_fp16_0, y = layers_c3_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_91_cast_fp16")];
+            tensor<int32, [3]> var_8949 = const()[name = string("op_8949"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_8952_axes_0 = const()[name = string("op_8952_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_8950_cast_fp16 = transpose(perm = var_8949, x = h_91_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 2560, 1, 1]> var_8952_cast_fp16 = expand_dims(axes = var_8952_axes_0, x = var_8950_cast_fp16)[name = string("op_8952_cast_fp16")];
+            string var_8968_pad_type_0 = const()[name = string("op_8968_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_8968_strides_0 = const()[name = string("op_8968_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_8968_pad_0 = const()[name = string("op_8968_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_8968_dilations_0 = const()[name = string("op_8968_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_8968_groups_0 = const()[name = string("op_8968_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_8968 = conv(dilations = var_8968_dilations_0, groups = var_8968_groups_0, pad = var_8968_pad_0, pad_type = var_8968_pad_type_0, strides = var_8968_strides_0, weight = layers_c3_3_self_attn_q_proj_weight_palettized, x = var_8952_cast_fp16)[name = string("op_8968")];
+            tensor<int32, [4]> var_8973 = const()[name = string("op_8973"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_8974 = reshape(shape = var_8973, x = var_8968)[name = string("op_8974")];
+            tensor<int32, [4]> var_8979 = const()[name = string("op_8979"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_8989 = const()[name = string("op_8989"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_8980 = transpose(perm = var_8979, x = var_8974)[name = string("transpose_58")];
+            tensor<fp16, [1, 8, 256]> x_289 = reshape(shape = var_8989, x = var_8980)[name = string("x_289")];
+            int32 var_8995 = const()[name = string("op_8995"), val = int32(-1)];
+            fp16 const_165_promoted = const()[name = string("const_165_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_8997 = mul(x = x_289, y = const_165_promoted)[name = string("op_8997")];
+            bool input_429_interleave_0 = const()[name = string("input_429_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_429 = concat(axis = var_8995, interleave = input_429_interleave_0, values = (x_289, var_8997))[name = string("input_429")];
+            tensor<int32, [1]> normed_413_axes_0 = const()[name = string("normed_413_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_8992_to_fp16 = const()[name = string("op_8992_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_413_cast_fp16 = layer_norm(axes = normed_413_axes_0, epsilon = var_8992_to_fp16, x = input_429)[name = string("normed_413_cast_fp16")];
+            tensor<int32, [2]> var_9002_split_sizes_0 = const()[name = string("op_9002_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_9002_axis_0 = const()[name = string("op_9002_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_9002_0, tensor<fp16, [1, 8, 256]> var_9002_1 = split(axis = var_9002_axis_0, split_sizes = var_9002_split_sizes_0, x = normed_413_cast_fp16)[name = string("op_9002")];
+            tensor<fp16, [1, 8, 256]> var_9004 = mul(x = var_9002_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_9004")];
+            tensor<int32, [4]> var_9009 = const()[name = string("op_9009"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_117 = reshape(shape = var_9009, x = var_9004)[name = string("q_117")];
+            tensor<fp16, [1, 8, 1, 256]> var_9011_cast_fp16 = mul(x = q_117, y = cos_s)[name = string("op_9011_cast_fp16")];
+            tensor<int32, [2]> var_9012_split_sizes_0 = const()[name = string("op_9012_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_9012_axis_0 = const()[name = string("op_9012_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_9012_0, tensor<fp16, [1, 8, 1, 128]> var_9012_1 = split(axis = var_9012_axis_0, split_sizes = var_9012_split_sizes_0, x = q_117)[name = string("op_9012")];
+            fp16 const_166_promoted = const()[name = string("const_166_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_9014 = mul(x = var_9012_1, y = const_166_promoted)[name = string("op_9014")];
+            int32 var_9016 = const()[name = string("op_9016"), val = int32(-1)];
+            bool var_9017_interleave_0 = const()[name = string("op_9017_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_9017 = concat(axis = var_9016, interleave = var_9017_interleave_0, values = (var_9014, var_9012_0))[name = string("op_9017")];
+            tensor<fp16, [1, 8, 1, 256]> var_9018_cast_fp16 = mul(x = var_9017, y = sin_s)[name = string("op_9018_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_119_cast_fp16 = add(x = var_9011_cast_fp16, y = var_9018_cast_fp16)[name = string("q_119_cast_fp16")];
+            bool attn_weights_61_transpose_x_0 = const()[name = string("attn_weights_61_transpose_x_0"), val = bool(false)];
+            bool attn_weights_61_transpose_y_0 = const()[name = string("attn_weights_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_61_cast_fp16 = matmul(transpose_x = attn_weights_61_transpose_x_0, transpose_y = attn_weights_61_transpose_y_0, x = q_119_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_61_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_291_cast_fp16 = add(x = attn_weights_61_cast_fp16, y = causal_mask_sliding)[name = string("x_291_cast_fp16")];
+            tensor<int32, [1]> reduce_max_15_axes_0 = const()[name = string("reduce_max_15_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_15_keep_dims_0 = const()[name = string("reduce_max_15_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_15 = reduce_max(axes = reduce_max_15_axes_0, keep_dims = reduce_max_15_keep_dims_0, x = x_291_cast_fp16)[name = string("reduce_max_15")];
+            tensor<fp16, [1, 8, 1, 512]> var_9050 = sub(x = x_291_cast_fp16, y = reduce_max_15)[name = string("op_9050")];
+            tensor<fp16, [1, 8, 1, 512]> var_9056 = exp(x = var_9050)[name = string("op_9056")];
+            tensor<int32, [1]> var_9066_axes_0 = const()[name = string("op_9066_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_9066_keep_dims_0 = const()[name = string("op_9066_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_9066 = reduce_sum(axes = var_9066_axes_0, keep_dims = var_9066_keep_dims_0, x = var_9056)[name = string("op_9066")];
+            tensor<fp16, [1, 8, 1, 512]> var_9072_cast_fp16 = real_div(x = var_9056, y = var_9066)[name = string("op_9072_cast_fp16")];
+            bool attn_output_91_transpose_x_0 = const()[name = string("attn_output_91_transpose_x_0"), val = bool(false)];
+            bool attn_output_91_transpose_y_0 = const()[name = string("attn_output_91_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_91_cast_fp16 = matmul(transpose_x = attn_output_91_transpose_x_0, transpose_y = attn_output_91_transpose_y_0, x = var_9072_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_91_cast_fp16")];
+            tensor<int32, [4]> var_9083 = const()[name = string("op_9083"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_9090 = const()[name = string("op_9090"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_9084_cast_fp16 = transpose(perm = var_9083, x = attn_output_91_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 1, 2048]> attn_output_93_cast_fp16 = reshape(shape = var_9090, x = var_9084_cast_fp16)[name = string("attn_output_93_cast_fp16")];
+            tensor<int32, [3]> var_9095 = const()[name = string("op_9095"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_9111_pad_type_0 = const()[name = string("op_9111_pad_type_0"), val = string("valid")];
+            int32 var_9111_groups_0 = const()[name = string("op_9111_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_9111_strides_0 = const()[name = string("op_9111_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_9111_pad_0 = const()[name = string("op_9111_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_9111_dilations_0 = const()[name = string("op_9111_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_15_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(964468352))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967089856))))[name = string("squeeze_15_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_9096_cast_fp16 = transpose(perm = var_9095, x = attn_output_93_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 1]> var_9111_cast_fp16 = conv(dilations = var_9111_dilations_0, groups = var_9111_groups_0, pad = var_9111_pad_0, pad_type = var_9111_pad_type_0, strides = var_9111_strides_0, weight = squeeze_15_cast_fp16_to_fp32_to_fp16_palettized, x = var_9096_cast_fp16)[name = string("op_9111_cast_fp16")];
+            tensor<int32, [3]> var_9115 = const()[name = string("op_9115"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9121 = const()[name = string("op_9121"), val = int32(-1)];
+            fp16 const_167_promoted_to_fp16 = const()[name = string("const_167_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_295_cast_fp16 = transpose(perm = var_9115, x = var_9111_cast_fp16)[name = string("transpose_55")];
+            tensor<fp16, [1, 1, 2560]> var_9123_cast_fp16 = mul(x = x_295_cast_fp16, y = const_167_promoted_to_fp16)[name = string("op_9123_cast_fp16")];
+            bool input_433_interleave_0 = const()[name = string("input_433_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_433_cast_fp16 = concat(axis = var_9121, interleave = input_433_interleave_0, values = (x_295_cast_fp16, var_9123_cast_fp16))[name = string("input_433_cast_fp16")];
+            tensor<int32, [1]> normed_417_axes_0 = const()[name = string("normed_417_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9118_to_fp16 = const()[name = string("op_9118_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_417_cast_fp16 = layer_norm(axes = normed_417_axes_0, epsilon = var_9118_to_fp16, x = input_433_cast_fp16)[name = string("normed_417_cast_fp16")];
+            tensor<int32, [2]> var_9128_split_sizes_0 = const()[name = string("op_9128_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9128_axis_0 = const()[name = string("op_9128_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9128_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9128_cast_fp16_1 = split(axis = var_9128_axis_0, split_sizes = var_9128_split_sizes_0, x = normed_417_cast_fp16)[name = string("op_9128_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967092480)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_95_cast_fp16 = mul(x = var_9128_cast_fp16_0, y = layers_c3_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_95_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_297_cast_fp16 = add(x = x_287_cast_fp16, y = attn_output_95_cast_fp16)[name = string("x_297_cast_fp16")];
+            int32 var_9137 = const()[name = string("op_9137"), val = int32(-1)];
+            fp16 const_168_promoted_to_fp16 = const()[name = string("const_168_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_9139_cast_fp16 = mul(x = x_297_cast_fp16, y = const_168_promoted_to_fp16)[name = string("op_9139_cast_fp16")];
+            bool input_435_interleave_0 = const()[name = string("input_435_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_435_cast_fp16 = concat(axis = var_9137, interleave = input_435_interleave_0, values = (x_297_cast_fp16, var_9139_cast_fp16))[name = string("input_435_cast_fp16")];
+            tensor<int32, [1]> normed_421_axes_0 = const()[name = string("normed_421_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9134_to_fp16 = const()[name = string("op_9134_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_421_cast_fp16 = layer_norm(axes = normed_421_axes_0, epsilon = var_9134_to_fp16, x = input_435_cast_fp16)[name = string("normed_421_cast_fp16")];
+            tensor<int32, [2]> var_9144_split_sizes_0 = const()[name = string("op_9144_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9144_axis_0 = const()[name = string("op_9144_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9144_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9144_cast_fp16_1 = split(axis = var_9144_axis_0, split_sizes = var_9144_split_sizes_0, x = normed_421_cast_fp16)[name = string("op_9144_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967097664)))];
+            tensor<fp16, [1, 1, 2560]> h_93_cast_fp16 = mul(x = var_9144_cast_fp16_0, y = layers_c3_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_93_cast_fp16")];
+            tensor<int32, [3]> var_9155 = const()[name = string("op_9155"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_437_axes_0 = const()[name = string("input_437_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9156 = transpose(perm = var_9155, x = h_93_cast_fp16)[name = string("transpose_54")];
+            tensor<fp16, [1, 2560, 1, 1]> input_437 = expand_dims(axes = input_437_axes_0, x = var_9156)[name = string("input_437")];
+            string gate_61_pad_type_0 = const()[name = string("gate_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_61_strides_0 = const()[name = string("gate_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_61_pad_0 = const()[name = string("gate_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_61_dilations_0 = const()[name = string("gate_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_61_groups_0 = const()[name = string("gate_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_61 = conv(dilations = gate_61_dilations_0, groups = gate_61_groups_0, pad = gate_61_pad_0, pad_type = gate_61_pad_type_0, strides = gate_61_strides_0, weight = layers_c3_3_mlp_gate_proj_weight_palettized, x = input_437)[name = string("gate_61")];
+            string up_31_pad_type_0 = const()[name = string("up_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_31_strides_0 = const()[name = string("up_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_31_pad_0 = const()[name = string("up_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_31_dilations_0 = const()[name = string("up_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_31_groups_0 = const()[name = string("up_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_31 = conv(dilations = up_31_dilations_0, groups = up_31_groups_0, pad = up_31_pad_0, pad_type = up_31_pad_type_0, strides = up_31_strides_0, weight = layers_c3_3_mlp_up_proj_weight_palettized, x = input_437)[name = string("up_31")];
+            string gate_63_mode_0 = const()[name = string("gate_63_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_63 = gelu(mode = gate_63_mode_0, x = gate_61)[name = string("gate_63")];
+            tensor<fp16, [1, 10240, 1, 1]> input_439 = mul(x = gate_63, y = up_31)[name = string("input_439")];
+            string mlp_out_31_pad_type_0 = const()[name = string("mlp_out_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_31_strides_0 = const()[name = string("mlp_out_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_31_pad_0 = const()[name = string("mlp_out_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_31_dilations_0 = const()[name = string("mlp_out_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_31_groups_0 = const()[name = string("mlp_out_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_31 = conv(dilations = mlp_out_31_dilations_0, groups = mlp_out_31_groups_0, pad = mlp_out_31_pad_0, pad_type = mlp_out_31_pad_type_0, strides = mlp_out_31_strides_0, weight = layers_c3_3_mlp_down_proj_weight_palettized, x = input_439)[name = string("mlp_out_31")];
+            tensor<int32, [1]> var_9196_axes_0 = const()[name = string("op_9196_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9196 = squeeze(axes = var_9196_axes_0, x = mlp_out_31)[name = string("op_9196")];
+            tensor<int32, [3]> var_9200 = const()[name = string("op_9200"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9206 = const()[name = string("op_9206"), val = int32(-1)];
+            fp16 const_169_promoted = const()[name = string("const_169_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_299 = transpose(perm = var_9200, x = var_9196)[name = string("transpose_53")];
+            tensor<fp16, [1, 1, 2560]> var_9208 = mul(x = x_299, y = const_169_promoted)[name = string("op_9208")];
+            bool input_441_interleave_0 = const()[name = string("input_441_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_441 = concat(axis = var_9206, interleave = input_441_interleave_0, values = (x_299, var_9208))[name = string("input_441")];
+            tensor<int32, [1]> normed_425_axes_0 = const()[name = string("normed_425_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9203_to_fp16 = const()[name = string("op_9203_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_425_cast_fp16 = layer_norm(axes = normed_425_axes_0, epsilon = var_9203_to_fp16, x = input_441)[name = string("normed_425_cast_fp16")];
+            tensor<int32, [2]> var_9213_split_sizes_0 = const()[name = string("op_9213_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9213_axis_0 = const()[name = string("op_9213_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9213_0, tensor<fp16, [1, 1, 2560]> var_9213_1 = split(axis = var_9213_axis_0, split_sizes = var_9213_split_sizes_0, x = normed_425_cast_fp16)[name = string("op_9213")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_153 = mul(x = var_9213_0, y = layers_c3_3_post_feedforward_layernorm_weight)[name = string("hidden_states_153")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_155_cast_fp16 = add(x = x_297_cast_fp16, y = hidden_states_153)[name = string("hidden_states_155_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_31_begin_0 = const()[name = string("per_layer_slice_31_begin_0"), val = tensor<int32, [3]>([0, 0, 6912])];
+            tensor<int32, [3]> per_layer_slice_31_end_0 = const()[name = string("per_layer_slice_31_end_0"), val = tensor<int32, [3]>([1, 1, 7168])];
+            tensor<bool, [3]> per_layer_slice_31_end_mask_0 = const()[name = string("per_layer_slice_31_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_31_cast_fp16 = slice_by_index(begin = per_layer_slice_31_begin_0, end = per_layer_slice_31_end_0, end_mask = per_layer_slice_31_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_31_cast_fp16")];
+            tensor<int32, [3]> var_9241 = const()[name = string("op_9241"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_443_axes_0 = const()[name = string("input_443_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9242 = transpose(perm = var_9241, x = hidden_states_155_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 2560, 1, 1]> input_443 = expand_dims(axes = input_443_axes_0, x = var_9242)[name = string("input_443")];
+            string gated_91_pad_type_0 = const()[name = string("gated_91_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_91_strides_0 = const()[name = string("gated_91_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_91_pad_0 = const()[name = string("gated_91_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_91_dilations_0 = const()[name = string("gated_91_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_91_groups_0 = const()[name = string("gated_91_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_91 = conv(dilations = gated_91_dilations_0, groups = gated_91_groups_0, pad = gated_91_pad_0, pad_type = gated_91_pad_type_0, strides = gated_91_strides_0, weight = layers_c3_3_per_layer_input_gate_weight_palettized, x = input_443)[name = string("gated_91")];
+            string gated_93_mode_0 = const()[name = string("gated_93_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_93 = gelu(mode = gated_93_mode_0, x = gated_91)[name = string("gated_93")];
+            tensor<int32, [3]> var_9261 = const()[name = string("op_9261"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_31_axes_0 = const()[name = string("per_layer_slice_conv_31_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_9262_cast_fp16 = transpose(perm = var_9261, x = per_layer_slice_31_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_31_cast_fp16 = expand_dims(axes = per_layer_slice_conv_31_axes_0, x = var_9262_cast_fp16)[name = string("per_layer_slice_conv_31_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_445_cast_fp16 = mul(x = gated_93, y = per_layer_slice_conv_31_cast_fp16)[name = string("input_445_cast_fp16")];
+            string gated_95_pad_type_0 = const()[name = string("gated_95_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_95_strides_0 = const()[name = string("gated_95_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_95_pad_0 = const()[name = string("gated_95_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_95_dilations_0 = const()[name = string("gated_95_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_95_groups_0 = const()[name = string("gated_95_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967102848))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967430592))))[name = string("layers_c3_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_95_cast_fp16 = conv(dilations = gated_95_dilations_0, groups = gated_95_groups_0, pad = gated_95_pad_0, pad_type = gated_95_pad_type_0, strides = gated_95_strides_0, weight = layers_c3_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_445_cast_fp16)[name = string("gated_95_cast_fp16")];
+            tensor<int32, [1]> var_9278_axes_0 = const()[name = string("op_9278_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9278_cast_fp16 = squeeze(axes = var_9278_axes_0, x = gated_95_cast_fp16)[name = string("op_9278_cast_fp16")];
+            tensor<int32, [3]> var_9282 = const()[name = string("op_9282"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9288 = const()[name = string("op_9288"), val = int32(-1)];
+            fp16 const_170_promoted_to_fp16 = const()[name = string("const_170_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_301_cast_fp16 = transpose(perm = var_9282, x = var_9278_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [1, 1, 2560]> var_9290_cast_fp16 = mul(x = x_301_cast_fp16, y = const_170_promoted_to_fp16)[name = string("op_9290_cast_fp16")];
+            bool input_447_interleave_0 = const()[name = string("input_447_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_447_cast_fp16 = concat(axis = var_9288, interleave = input_447_interleave_0, values = (x_301_cast_fp16, var_9290_cast_fp16))[name = string("input_447_cast_fp16")];
+            tensor<int32, [1]> normed_429_axes_0 = const()[name = string("normed_429_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9285_to_fp16 = const()[name = string("op_9285_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_429_cast_fp16 = layer_norm(axes = normed_429_axes_0, epsilon = var_9285_to_fp16, x = input_447_cast_fp16)[name = string("normed_429_cast_fp16")];
+            tensor<int32, [2]> var_9295_split_sizes_0 = const()[name = string("op_9295_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9295_axis_0 = const()[name = string("op_9295_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9295_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9295_cast_fp16_1 = split(axis = var_9295_axis_0, split_sizes = var_9295_split_sizes_0, x = normed_429_cast_fp16)[name = string("op_9295_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967433216)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_159_cast_fp16 = mul(x = var_9295_cast_fp16_0, y = layers_c3_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_159_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_161_cast_fp16 = add(x = hidden_states_155_cast_fp16, y = hidden_states_159_cast_fp16)[name = string("hidden_states_161_cast_fp16")];
+            tensor<fp16, [1]> const_171_promoted_to_fp16 = const()[name = string("const_171_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.62p-1])];
+            tensor<fp16, [1, 1, 2560]> x_303_cast_fp16 = mul(x = hidden_states_161_cast_fp16, y = const_171_promoted_to_fp16)[name = string("x_303_cast_fp16")];
+            int32 var_9310 = const()[name = string("op_9310"), val = int32(-1)];
+            fp16 const_172_promoted_to_fp16 = const()[name = string("const_172_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_9312_cast_fp16 = mul(x = x_303_cast_fp16, y = const_172_promoted_to_fp16)[name = string("op_9312_cast_fp16")];
+            bool input_449_interleave_0 = const()[name = string("input_449_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_449_cast_fp16 = concat(axis = var_9310, interleave = input_449_interleave_0, values = (x_303_cast_fp16, var_9312_cast_fp16))[name = string("input_449_cast_fp16")];
+            tensor<int32, [1]> normed_433_axes_0 = const()[name = string("normed_433_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9307_to_fp16 = const()[name = string("op_9307_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_433_cast_fp16 = layer_norm(axes = normed_433_axes_0, epsilon = var_9307_to_fp16, x = input_449_cast_fp16)[name = string("normed_433_cast_fp16")];
+            tensor<int32, [2]> var_9317_split_sizes_0 = const()[name = string("op_9317_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9317_axis_0 = const()[name = string("op_9317_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9317_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9317_cast_fp16_1 = split(axis = var_9317_axis_0, split_sizes = var_9317_split_sizes_0, x = normed_433_cast_fp16)[name = string("op_9317_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967438400)))];
+            tensor<fp16, [1, 1, 2560]> h_97_cast_fp16 = mul(x = var_9317_cast_fp16_0, y = layers_c3_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_97_cast_fp16")];
+            tensor<int32, [3]> var_9323 = const()[name = string("op_9323"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_9326_axes_0 = const()[name = string("op_9326_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9324_cast_fp16 = transpose(perm = var_9323, x = h_97_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 2560, 1, 1]> var_9326_cast_fp16 = expand_dims(axes = var_9326_axes_0, x = var_9324_cast_fp16)[name = string("op_9326_cast_fp16")];
+            string var_9342_pad_type_0 = const()[name = string("op_9342_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9342_strides_0 = const()[name = string("op_9342_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9342_pad_0 = const()[name = string("op_9342_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9342_dilations_0 = const()[name = string("op_9342_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9342_groups_0 = const()[name = string("op_9342_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_9342 = conv(dilations = var_9342_dilations_0, groups = var_9342_groups_0, pad = var_9342_pad_0, pad_type = var_9342_pad_type_0, strides = var_9342_strides_0, weight = layers_c3_4_self_attn_q_proj_weight_palettized, x = var_9326_cast_fp16)[name = string("op_9342")];
+            tensor<int32, [4]> var_9347 = const()[name = string("op_9347"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_9348 = reshape(shape = var_9347, x = var_9342)[name = string("op_9348")];
+            tensor<int32, [4]> var_9353 = const()[name = string("op_9353"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_9363 = const()[name = string("op_9363"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_9354 = transpose(perm = var_9353, x = var_9348)[name = string("transpose_48")];
+            tensor<fp16, [1, 8, 256]> x_305 = reshape(shape = var_9363, x = var_9354)[name = string("x_305")];
+            int32 var_9369 = const()[name = string("op_9369"), val = int32(-1)];
+            fp16 const_173_promoted = const()[name = string("const_173_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_9371 = mul(x = x_305, y = const_173_promoted)[name = string("op_9371")];
+            bool input_453_interleave_0 = const()[name = string("input_453_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_453 = concat(axis = var_9369, interleave = input_453_interleave_0, values = (x_305, var_9371))[name = string("input_453")];
+            tensor<int32, [1]> normed_437_axes_0 = const()[name = string("normed_437_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9366_to_fp16 = const()[name = string("op_9366_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_437_cast_fp16 = layer_norm(axes = normed_437_axes_0, epsilon = var_9366_to_fp16, x = input_453)[name = string("normed_437_cast_fp16")];
+            tensor<int32, [2]> var_9376_split_sizes_0 = const()[name = string("op_9376_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_9376_axis_0 = const()[name = string("op_9376_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_9376_0, tensor<fp16, [1, 8, 256]> var_9376_1 = split(axis = var_9376_axis_0, split_sizes = var_9376_split_sizes_0, x = normed_437_cast_fp16)[name = string("op_9376")];
+            tensor<fp16, [1, 8, 256]> var_9378 = mul(x = var_9376_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_9378")];
+            tensor<int32, [4]> var_9383 = const()[name = string("op_9383"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_123 = reshape(shape = var_9383, x = var_9378)[name = string("q_123")];
+            tensor<fp16, [1, 8, 1, 256]> var_9385_cast_fp16 = mul(x = q_123, y = cos_s)[name = string("op_9385_cast_fp16")];
+            tensor<int32, [2]> var_9386_split_sizes_0 = const()[name = string("op_9386_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_9386_axis_0 = const()[name = string("op_9386_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_9386_0, tensor<fp16, [1, 8, 1, 128]> var_9386_1 = split(axis = var_9386_axis_0, split_sizes = var_9386_split_sizes_0, x = q_123)[name = string("op_9386")];
+            fp16 const_174_promoted = const()[name = string("const_174_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_9388 = mul(x = var_9386_1, y = const_174_promoted)[name = string("op_9388")];
+            int32 var_9390 = const()[name = string("op_9390"), val = int32(-1)];
+            bool var_9391_interleave_0 = const()[name = string("op_9391_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_9391 = concat(axis = var_9390, interleave = var_9391_interleave_0, values = (var_9388, var_9386_0))[name = string("op_9391")];
+            tensor<fp16, [1, 8, 1, 256]> var_9392_cast_fp16 = mul(x = var_9391, y = sin_s)[name = string("op_9392_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_125_cast_fp16 = add(x = var_9385_cast_fp16, y = var_9392_cast_fp16)[name = string("q_125_cast_fp16")];
+            bool attn_weights_65_transpose_x_0 = const()[name = string("attn_weights_65_transpose_x_0"), val = bool(false)];
+            bool attn_weights_65_transpose_y_0 = const()[name = string("attn_weights_65_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_65_cast_fp16 = matmul(transpose_x = attn_weights_65_transpose_x_0, transpose_y = attn_weights_65_transpose_y_0, x = q_125_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_65_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_307_cast_fp16 = add(x = attn_weights_65_cast_fp16, y = causal_mask_sliding)[name = string("x_307_cast_fp16")];
+            tensor<int32, [1]> reduce_max_16_axes_0 = const()[name = string("reduce_max_16_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_16_keep_dims_0 = const()[name = string("reduce_max_16_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_16 = reduce_max(axes = reduce_max_16_axes_0, keep_dims = reduce_max_16_keep_dims_0, x = x_307_cast_fp16)[name = string("reduce_max_16")];
+            tensor<fp16, [1, 8, 1, 512]> var_9424 = sub(x = x_307_cast_fp16, y = reduce_max_16)[name = string("op_9424")];
+            tensor<fp16, [1, 8, 1, 512]> var_9430 = exp(x = var_9424)[name = string("op_9430")];
+            tensor<int32, [1]> var_9440_axes_0 = const()[name = string("op_9440_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_9440_keep_dims_0 = const()[name = string("op_9440_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_9440 = reduce_sum(axes = var_9440_axes_0, keep_dims = var_9440_keep_dims_0, x = var_9430)[name = string("op_9440")];
+            tensor<fp16, [1, 8, 1, 512]> var_9446_cast_fp16 = real_div(x = var_9430, y = var_9440)[name = string("op_9446_cast_fp16")];
+            bool attn_output_97_transpose_x_0 = const()[name = string("attn_output_97_transpose_x_0"), val = bool(false)];
+            bool attn_output_97_transpose_y_0 = const()[name = string("attn_output_97_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_97_cast_fp16 = matmul(transpose_x = attn_output_97_transpose_x_0, transpose_y = attn_output_97_transpose_y_0, x = var_9446_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_97_cast_fp16")];
+            tensor<int32, [4]> var_9457 = const()[name = string("op_9457"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_9464 = const()[name = string("op_9464"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_9458_cast_fp16 = transpose(perm = var_9457, x = attn_output_97_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 1, 2048]> attn_output_99_cast_fp16 = reshape(shape = var_9464, x = var_9458_cast_fp16)[name = string("attn_output_99_cast_fp16")];
+            tensor<int32, [3]> var_9469 = const()[name = string("op_9469"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_9485_pad_type_0 = const()[name = string("op_9485_pad_type_0"), val = string("valid")];
+            int32 var_9485_groups_0 = const()[name = string("op_9485_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_9485_strides_0 = const()[name = string("op_9485_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_9485_pad_0 = const()[name = string("op_9485_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_9485_dilations_0 = const()[name = string("op_9485_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_16_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(967443584))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970065088))))[name = string("squeeze_16_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_9470_cast_fp16 = transpose(perm = var_9469, x = attn_output_99_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 2560, 1]> var_9485_cast_fp16 = conv(dilations = var_9485_dilations_0, groups = var_9485_groups_0, pad = var_9485_pad_0, pad_type = var_9485_pad_type_0, strides = var_9485_strides_0, weight = squeeze_16_cast_fp16_to_fp32_to_fp16_palettized, x = var_9470_cast_fp16)[name = string("op_9485_cast_fp16")];
+            tensor<int32, [3]> var_9489 = const()[name = string("op_9489"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9495 = const()[name = string("op_9495"), val = int32(-1)];
+            fp16 const_175_promoted_to_fp16 = const()[name = string("const_175_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_311_cast_fp16 = transpose(perm = var_9489, x = var_9485_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 1, 2560]> var_9497_cast_fp16 = mul(x = x_311_cast_fp16, y = const_175_promoted_to_fp16)[name = string("op_9497_cast_fp16")];
+            bool input_457_interleave_0 = const()[name = string("input_457_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_457_cast_fp16 = concat(axis = var_9495, interleave = input_457_interleave_0, values = (x_311_cast_fp16, var_9497_cast_fp16))[name = string("input_457_cast_fp16")];
+            tensor<int32, [1]> normed_441_axes_0 = const()[name = string("normed_441_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9492_to_fp16 = const()[name = string("op_9492_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_441_cast_fp16 = layer_norm(axes = normed_441_axes_0, epsilon = var_9492_to_fp16, x = input_457_cast_fp16)[name = string("normed_441_cast_fp16")];
+            tensor<int32, [2]> var_9502_split_sizes_0 = const()[name = string("op_9502_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9502_axis_0 = const()[name = string("op_9502_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9502_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9502_cast_fp16_1 = split(axis = var_9502_axis_0, split_sizes = var_9502_split_sizes_0, x = normed_441_cast_fp16)[name = string("op_9502_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970067712)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_101_cast_fp16 = mul(x = var_9502_cast_fp16_0, y = layers_c3_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_101_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_313_cast_fp16 = add(x = x_303_cast_fp16, y = attn_output_101_cast_fp16)[name = string("x_313_cast_fp16")];
+            int32 var_9511 = const()[name = string("op_9511"), val = int32(-1)];
+            fp16 const_176_promoted_to_fp16 = const()[name = string("const_176_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_9513_cast_fp16 = mul(x = x_313_cast_fp16, y = const_176_promoted_to_fp16)[name = string("op_9513_cast_fp16")];
+            bool input_459_interleave_0 = const()[name = string("input_459_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_459_cast_fp16 = concat(axis = var_9511, interleave = input_459_interleave_0, values = (x_313_cast_fp16, var_9513_cast_fp16))[name = string("input_459_cast_fp16")];
+            tensor<int32, [1]> normed_445_axes_0 = const()[name = string("normed_445_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9508_to_fp16 = const()[name = string("op_9508_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_445_cast_fp16 = layer_norm(axes = normed_445_axes_0, epsilon = var_9508_to_fp16, x = input_459_cast_fp16)[name = string("normed_445_cast_fp16")];
+            tensor<int32, [2]> var_9518_split_sizes_0 = const()[name = string("op_9518_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9518_axis_0 = const()[name = string("op_9518_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9518_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9518_cast_fp16_1 = split(axis = var_9518_axis_0, split_sizes = var_9518_split_sizes_0, x = normed_445_cast_fp16)[name = string("op_9518_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970072896)))];
+            tensor<fp16, [1, 1, 2560]> h_99_cast_fp16 = mul(x = var_9518_cast_fp16_0, y = layers_c3_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_99_cast_fp16")];
+            tensor<int32, [3]> var_9529 = const()[name = string("op_9529"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_461_axes_0 = const()[name = string("input_461_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9530 = transpose(perm = var_9529, x = h_99_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 2560, 1, 1]> input_461 = expand_dims(axes = input_461_axes_0, x = var_9530)[name = string("input_461")];
+            string gate_65_pad_type_0 = const()[name = string("gate_65_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_65_strides_0 = const()[name = string("gate_65_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_65_pad_0 = const()[name = string("gate_65_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_65_dilations_0 = const()[name = string("gate_65_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_65_groups_0 = const()[name = string("gate_65_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_65 = conv(dilations = gate_65_dilations_0, groups = gate_65_groups_0, pad = gate_65_pad_0, pad_type = gate_65_pad_type_0, strides = gate_65_strides_0, weight = layers_c3_4_mlp_gate_proj_weight_palettized, x = input_461)[name = string("gate_65")];
+            string up_33_pad_type_0 = const()[name = string("up_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_33_strides_0 = const()[name = string("up_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_33_pad_0 = const()[name = string("up_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_33_dilations_0 = const()[name = string("up_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_33_groups_0 = const()[name = string("up_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_33 = conv(dilations = up_33_dilations_0, groups = up_33_groups_0, pad = up_33_pad_0, pad_type = up_33_pad_type_0, strides = up_33_strides_0, weight = layers_c3_4_mlp_up_proj_weight_palettized, x = input_461)[name = string("up_33")];
+            string gate_67_mode_0 = const()[name = string("gate_67_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_67 = gelu(mode = gate_67_mode_0, x = gate_65)[name = string("gate_67")];
+            tensor<fp16, [1, 10240, 1, 1]> input_463 = mul(x = gate_67, y = up_33)[name = string("input_463")];
+            string mlp_out_33_pad_type_0 = const()[name = string("mlp_out_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_33_strides_0 = const()[name = string("mlp_out_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_33_pad_0 = const()[name = string("mlp_out_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_33_dilations_0 = const()[name = string("mlp_out_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_33_groups_0 = const()[name = string("mlp_out_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_33 = conv(dilations = mlp_out_33_dilations_0, groups = mlp_out_33_groups_0, pad = mlp_out_33_pad_0, pad_type = mlp_out_33_pad_type_0, strides = mlp_out_33_strides_0, weight = layers_c3_4_mlp_down_proj_weight_palettized, x = input_463)[name = string("mlp_out_33")];
+            tensor<int32, [1]> var_9570_axes_0 = const()[name = string("op_9570_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9570 = squeeze(axes = var_9570_axes_0, x = mlp_out_33)[name = string("op_9570")];
+            tensor<int32, [3]> var_9574 = const()[name = string("op_9574"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9580 = const()[name = string("op_9580"), val = int32(-1)];
+            fp16 const_177_promoted = const()[name = string("const_177_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_315 = transpose(perm = var_9574, x = var_9570)[name = string("transpose_43")];
+            tensor<fp16, [1, 1, 2560]> var_9582 = mul(x = x_315, y = const_177_promoted)[name = string("op_9582")];
+            bool input_465_interleave_0 = const()[name = string("input_465_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_465 = concat(axis = var_9580, interleave = input_465_interleave_0, values = (x_315, var_9582))[name = string("input_465")];
+            tensor<int32, [1]> normed_449_axes_0 = const()[name = string("normed_449_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9577_to_fp16 = const()[name = string("op_9577_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_449_cast_fp16 = layer_norm(axes = normed_449_axes_0, epsilon = var_9577_to_fp16, x = input_465)[name = string("normed_449_cast_fp16")];
+            tensor<int32, [2]> var_9587_split_sizes_0 = const()[name = string("op_9587_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9587_axis_0 = const()[name = string("op_9587_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9587_0, tensor<fp16, [1, 1, 2560]> var_9587_1 = split(axis = var_9587_axis_0, split_sizes = var_9587_split_sizes_0, x = normed_449_cast_fp16)[name = string("op_9587")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_163 = mul(x = var_9587_0, y = layers_c3_4_post_feedforward_layernorm_weight)[name = string("hidden_states_163")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_165_cast_fp16 = add(x = x_313_cast_fp16, y = hidden_states_163)[name = string("hidden_states_165_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_33_begin_0 = const()[name = string("per_layer_slice_33_begin_0"), val = tensor<int32, [3]>([0, 0, 7168])];
+            tensor<int32, [3]> per_layer_slice_33_end_0 = const()[name = string("per_layer_slice_33_end_0"), val = tensor<int32, [3]>([1, 1, 7424])];
+            tensor<bool, [3]> per_layer_slice_33_end_mask_0 = const()[name = string("per_layer_slice_33_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_33_cast_fp16 = slice_by_index(begin = per_layer_slice_33_begin_0, end = per_layer_slice_33_end_0, end_mask = per_layer_slice_33_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_33_cast_fp16")];
+            tensor<int32, [3]> var_9615 = const()[name = string("op_9615"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_467_axes_0 = const()[name = string("input_467_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9616 = transpose(perm = var_9615, x = hidden_states_165_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 2560, 1, 1]> input_467 = expand_dims(axes = input_467_axes_0, x = var_9616)[name = string("input_467")];
+            string gated_97_pad_type_0 = const()[name = string("gated_97_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_97_strides_0 = const()[name = string("gated_97_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_97_pad_0 = const()[name = string("gated_97_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_97_dilations_0 = const()[name = string("gated_97_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_97_groups_0 = const()[name = string("gated_97_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_97 = conv(dilations = gated_97_dilations_0, groups = gated_97_groups_0, pad = gated_97_pad_0, pad_type = gated_97_pad_type_0, strides = gated_97_strides_0, weight = layers_c3_4_per_layer_input_gate_weight_palettized, x = input_467)[name = string("gated_97")];
+            string gated_99_mode_0 = const()[name = string("gated_99_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_99 = gelu(mode = gated_99_mode_0, x = gated_97)[name = string("gated_99")];
+            tensor<int32, [3]> var_9635 = const()[name = string("op_9635"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_33_axes_0 = const()[name = string("per_layer_slice_conv_33_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_9636_cast_fp16 = transpose(perm = var_9635, x = per_layer_slice_33_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_33_cast_fp16 = expand_dims(axes = per_layer_slice_conv_33_axes_0, x = var_9636_cast_fp16)[name = string("per_layer_slice_conv_33_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_469_cast_fp16 = mul(x = gated_99, y = per_layer_slice_conv_33_cast_fp16)[name = string("input_469_cast_fp16")];
+            string gated_101_pad_type_0 = const()[name = string("gated_101_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_101_strides_0 = const()[name = string("gated_101_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_101_pad_0 = const()[name = string("gated_101_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_101_dilations_0 = const()[name = string("gated_101_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_101_groups_0 = const()[name = string("gated_101_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970078080))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970405824))))[name = string("layers_c3_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_101_cast_fp16 = conv(dilations = gated_101_dilations_0, groups = gated_101_groups_0, pad = gated_101_pad_0, pad_type = gated_101_pad_type_0, strides = gated_101_strides_0, weight = layers_c3_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_469_cast_fp16)[name = string("gated_101_cast_fp16")];
+            tensor<int32, [1]> var_9652_axes_0 = const()[name = string("op_9652_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9652_cast_fp16 = squeeze(axes = var_9652_axes_0, x = gated_101_cast_fp16)[name = string("op_9652_cast_fp16")];
+            tensor<int32, [3]> var_9656 = const()[name = string("op_9656"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9662 = const()[name = string("op_9662"), val = int32(-1)];
+            fp16 const_178_promoted_to_fp16 = const()[name = string("const_178_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_317_cast_fp16 = transpose(perm = var_9656, x = var_9652_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 1, 2560]> var_9664_cast_fp16 = mul(x = x_317_cast_fp16, y = const_178_promoted_to_fp16)[name = string("op_9664_cast_fp16")];
+            bool input_471_interleave_0 = const()[name = string("input_471_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_471_cast_fp16 = concat(axis = var_9662, interleave = input_471_interleave_0, values = (x_317_cast_fp16, var_9664_cast_fp16))[name = string("input_471_cast_fp16")];
+            tensor<int32, [1]> normed_453_axes_0 = const()[name = string("normed_453_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9659_to_fp16 = const()[name = string("op_9659_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_453_cast_fp16 = layer_norm(axes = normed_453_axes_0, epsilon = var_9659_to_fp16, x = input_471_cast_fp16)[name = string("normed_453_cast_fp16")];
+            tensor<int32, [2]> var_9669_split_sizes_0 = const()[name = string("op_9669_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9669_axis_0 = const()[name = string("op_9669_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9669_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9669_cast_fp16_1 = split(axis = var_9669_axis_0, split_sizes = var_9669_split_sizes_0, x = normed_453_cast_fp16)[name = string("op_9669_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970408448)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_169_cast_fp16 = mul(x = var_9669_cast_fp16_0, y = layers_c3_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_169_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_171_cast_fp16 = add(x = hidden_states_165_cast_fp16, y = hidden_states_169_cast_fp16)[name = string("hidden_states_171_cast_fp16")];
+            tensor<fp16, [1]> const_179_promoted_to_fp16 = const()[name = string("const_179_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.3ap-1])];
+            tensor<fp16, [1, 1, 2560]> x_319_cast_fp16 = mul(x = hidden_states_171_cast_fp16, y = const_179_promoted_to_fp16)[name = string("x_319_cast_fp16")];
+            int32 var_9684 = const()[name = string("op_9684"), val = int32(-1)];
+            fp16 const_180_promoted_to_fp16 = const()[name = string("const_180_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_9686_cast_fp16 = mul(x = x_319_cast_fp16, y = const_180_promoted_to_fp16)[name = string("op_9686_cast_fp16")];
+            bool input_473_interleave_0 = const()[name = string("input_473_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_473_cast_fp16 = concat(axis = var_9684, interleave = input_473_interleave_0, values = (x_319_cast_fp16, var_9686_cast_fp16))[name = string("input_473_cast_fp16")];
+            tensor<int32, [1]> normed_457_axes_0 = const()[name = string("normed_457_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9681_to_fp16 = const()[name = string("op_9681_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_457_cast_fp16 = layer_norm(axes = normed_457_axes_0, epsilon = var_9681_to_fp16, x = input_473_cast_fp16)[name = string("normed_457_cast_fp16")];
+            tensor<int32, [2]> var_9691_split_sizes_0 = const()[name = string("op_9691_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9691_axis_0 = const()[name = string("op_9691_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9691_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9691_cast_fp16_1 = split(axis = var_9691_axis_0, split_sizes = var_9691_split_sizes_0, x = normed_457_cast_fp16)[name = string("op_9691_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970413632)))];
+            tensor<fp16, [1, 1, 2560]> h_103_cast_fp16 = mul(x = var_9691_cast_fp16_0, y = layers_c3_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_103_cast_fp16")];
+            tensor<int32, [3]> var_9697 = const()[name = string("op_9697"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_9700_axes_0 = const()[name = string("op_9700_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9698_cast_fp16 = transpose(perm = var_9697, x = h_103_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 2560, 1, 1]> var_9700_cast_fp16 = expand_dims(axes = var_9700_axes_0, x = var_9698_cast_fp16)[name = string("op_9700_cast_fp16")];
+            string var_9716_pad_type_0 = const()[name = string("op_9716_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_9716_strides_0 = const()[name = string("op_9716_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_9716_pad_0 = const()[name = string("op_9716_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_9716_dilations_0 = const()[name = string("op_9716_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_9716_groups_0 = const()[name = string("op_9716_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_9716 = conv(dilations = var_9716_dilations_0, groups = var_9716_groups_0, pad = var_9716_pad_0, pad_type = var_9716_pad_type_0, strides = var_9716_strides_0, weight = layers_c3_5_self_attn_q_proj_weight_palettized, x = var_9700_cast_fp16)[name = string("op_9716")];
+            tensor<int32, [4]> var_9721 = const()[name = string("op_9721"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_9722 = reshape(shape = var_9721, x = var_9716)[name = string("op_9722")];
+            tensor<int32, [4]> var_9727 = const()[name = string("op_9727"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_9737 = const()[name = string("op_9737"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_9728 = transpose(perm = var_9727, x = var_9722)[name = string("transpose_38")];
+            tensor<fp16, [1, 8, 512]> x_321 = reshape(shape = var_9737, x = var_9728)[name = string("x_321")];
+            int32 var_9743 = const()[name = string("op_9743"), val = int32(-1)];
+            fp16 const_181_promoted = const()[name = string("const_181_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_9745 = mul(x = x_321, y = const_181_promoted)[name = string("op_9745")];
+            bool input_477_interleave_0 = const()[name = string("input_477_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_477 = concat(axis = var_9743, interleave = input_477_interleave_0, values = (x_321, var_9745))[name = string("input_477")];
+            tensor<int32, [1]> normed_461_axes_0 = const()[name = string("normed_461_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9740_to_fp16 = const()[name = string("op_9740_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_461_cast_fp16 = layer_norm(axes = normed_461_axes_0, epsilon = var_9740_to_fp16, x = input_477)[name = string("normed_461_cast_fp16")];
+            tensor<int32, [2]> var_9750_split_sizes_0 = const()[name = string("op_9750_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_9750_axis_0 = const()[name = string("op_9750_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_9750_0, tensor<fp16, [1, 8, 512]> var_9750_1 = split(axis = var_9750_axis_0, split_sizes = var_9750_split_sizes_0, x = normed_461_cast_fp16)[name = string("op_9750")];
+            tensor<fp16, [1, 8, 512]> var_9752 = mul(x = var_9750_0, y = layers_c2_11_self_attn_q_norm_weight)[name = string("op_9752")];
+            tensor<int32, [4]> var_9757 = const()[name = string("op_9757"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_129 = reshape(shape = var_9757, x = var_9752)[name = string("q_129")];
+            tensor<fp16, [1, 8, 1, 512]> var_9759_cast_fp16 = mul(x = q_129, y = cos_f)[name = string("op_9759_cast_fp16")];
+            tensor<int32, [2]> var_9760_split_sizes_0 = const()[name = string("op_9760_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_9760_axis_0 = const()[name = string("op_9760_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_9760_0, tensor<fp16, [1, 8, 1, 256]> var_9760_1 = split(axis = var_9760_axis_0, split_sizes = var_9760_split_sizes_0, x = q_129)[name = string("op_9760")];
+            fp16 const_182_promoted = const()[name = string("const_182_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_9762 = mul(x = var_9760_1, y = const_182_promoted)[name = string("op_9762")];
+            int32 var_9764 = const()[name = string("op_9764"), val = int32(-1)];
+            bool var_9765_interleave_0 = const()[name = string("op_9765_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_9765 = concat(axis = var_9764, interleave = var_9765_interleave_0, values = (var_9762, var_9760_0))[name = string("op_9765")];
+            tensor<fp16, [1, 8, 1, 512]> var_9766_cast_fp16 = mul(x = var_9765, y = sin_f)[name = string("op_9766_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_131_cast_fp16 = add(x = var_9759_cast_fp16, y = var_9766_cast_fp16)[name = string("q_131_cast_fp16")];
+            bool attn_weights_69_transpose_x_0 = const()[name = string("attn_weights_69_transpose_x_0"), val = bool(false)];
+            bool attn_weights_69_transpose_y_0 = const()[name = string("attn_weights_69_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_69_cast_fp16 = matmul(transpose_x = attn_weights_69_transpose_x_0, transpose_y = attn_weights_69_transpose_y_0, x = q_131_cast_fp16, y = transpose_95_cast_fp16)[name = string("attn_weights_69_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_323_cast_fp16 = add(x = attn_weights_69_cast_fp16, y = causal_mask_full)[name = string("x_323_cast_fp16")];
+            tensor<int32, [1]> reduce_max_17_axes_0 = const()[name = string("reduce_max_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_17_keep_dims_0 = const()[name = string("reduce_max_17_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_17 = reduce_max(axes = reduce_max_17_axes_0, keep_dims = reduce_max_17_keep_dims_0, x = x_323_cast_fp16)[name = string("reduce_max_17")];
+            tensor<fp16, [1, 8, 1, 2048]> var_9798 = sub(x = x_323_cast_fp16, y = reduce_max_17)[name = string("op_9798")];
+            tensor<fp16, [1, 8, 1, 2048]> var_9804 = exp(x = var_9798)[name = string("op_9804")];
+            tensor<int32, [1]> var_9814_axes_0 = const()[name = string("op_9814_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_9814_keep_dims_0 = const()[name = string("op_9814_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_9814 = reduce_sum(axes = var_9814_axes_0, keep_dims = var_9814_keep_dims_0, x = var_9804)[name = string("op_9814")];
+            tensor<fp16, [1, 8, 1, 2048]> var_9820_cast_fp16 = real_div(x = var_9804, y = var_9814)[name = string("op_9820_cast_fp16")];
+            bool attn_output_103_transpose_x_0 = const()[name = string("attn_output_103_transpose_x_0"), val = bool(false)];
+            bool attn_output_103_transpose_y_0 = const()[name = string("attn_output_103_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_103_cast_fp16 = matmul(transpose_x = attn_output_103_transpose_x_0, transpose_y = attn_output_103_transpose_y_0, x = var_9820_cast_fp16, y = V_expanded_23_cast_fp16)[name = string("attn_output_103_cast_fp16")];
+            tensor<int32, [4]> var_9831 = const()[name = string("op_9831"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_9838 = const()[name = string("op_9838"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_9832_cast_fp16 = transpose(perm = var_9831, x = attn_output_103_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 1, 4096]> attn_output_105_cast_fp16 = reshape(shape = var_9838, x = var_9832_cast_fp16)[name = string("attn_output_105_cast_fp16")];
+            tensor<int32, [3]> var_9843 = const()[name = string("op_9843"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_9859_pad_type_0 = const()[name = string("op_9859_pad_type_0"), val = string("valid")];
+            int32 var_9859_groups_0 = const()[name = string("op_9859_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_9859_strides_0 = const()[name = string("op_9859_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_9859_pad_0 = const()[name = string("op_9859_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_9859_dilations_0 = const()[name = string("op_9859_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_17_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(970418816))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(975661760))))[name = string("squeeze_17_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_9844_cast_fp16 = transpose(perm = var_9843, x = attn_output_105_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 2560, 1]> var_9859_cast_fp16 = conv(dilations = var_9859_dilations_0, groups = var_9859_groups_0, pad = var_9859_pad_0, pad_type = var_9859_pad_type_0, strides = var_9859_strides_0, weight = squeeze_17_cast_fp16_to_fp32_to_fp16_palettized, x = var_9844_cast_fp16)[name = string("op_9859_cast_fp16")];
+            tensor<int32, [3]> var_9863 = const()[name = string("op_9863"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9869 = const()[name = string("op_9869"), val = int32(-1)];
+            fp16 const_183_promoted_to_fp16 = const()[name = string("const_183_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_327_cast_fp16 = transpose(perm = var_9863, x = var_9859_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 1, 2560]> var_9871_cast_fp16 = mul(x = x_327_cast_fp16, y = const_183_promoted_to_fp16)[name = string("op_9871_cast_fp16")];
+            bool input_481_interleave_0 = const()[name = string("input_481_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_481_cast_fp16 = concat(axis = var_9869, interleave = input_481_interleave_0, values = (x_327_cast_fp16, var_9871_cast_fp16))[name = string("input_481_cast_fp16")];
+            tensor<int32, [1]> normed_465_axes_0 = const()[name = string("normed_465_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9866_to_fp16 = const()[name = string("op_9866_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_465_cast_fp16 = layer_norm(axes = normed_465_axes_0, epsilon = var_9866_to_fp16, x = input_481_cast_fp16)[name = string("normed_465_cast_fp16")];
+            tensor<int32, [2]> var_9876_split_sizes_0 = const()[name = string("op_9876_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9876_axis_0 = const()[name = string("op_9876_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9876_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9876_cast_fp16_1 = split(axis = var_9876_axis_0, split_sizes = var_9876_split_sizes_0, x = normed_465_cast_fp16)[name = string("op_9876_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(975664384)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_107_cast_fp16 = mul(x = var_9876_cast_fp16_0, y = layers_c3_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_107_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_329_cast_fp16 = add(x = x_319_cast_fp16, y = attn_output_107_cast_fp16)[name = string("x_329_cast_fp16")];
+            int32 var_9885 = const()[name = string("op_9885"), val = int32(-1)];
+            fp16 const_184_promoted_to_fp16 = const()[name = string("const_184_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_9887_cast_fp16 = mul(x = x_329_cast_fp16, y = const_184_promoted_to_fp16)[name = string("op_9887_cast_fp16")];
+            bool input_483_interleave_0 = const()[name = string("input_483_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_483_cast_fp16 = concat(axis = var_9885, interleave = input_483_interleave_0, values = (x_329_cast_fp16, var_9887_cast_fp16))[name = string("input_483_cast_fp16")];
+            tensor<int32, [1]> normed_469_axes_0 = const()[name = string("normed_469_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9882_to_fp16 = const()[name = string("op_9882_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_469_cast_fp16 = layer_norm(axes = normed_469_axes_0, epsilon = var_9882_to_fp16, x = input_483_cast_fp16)[name = string("normed_469_cast_fp16")];
+            tensor<int32, [2]> var_9892_split_sizes_0 = const()[name = string("op_9892_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9892_axis_0 = const()[name = string("op_9892_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9892_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_9892_cast_fp16_1 = split(axis = var_9892_axis_0, split_sizes = var_9892_split_sizes_0, x = normed_469_cast_fp16)[name = string("op_9892_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(975669568)))];
+            tensor<fp16, [1, 1, 2560]> h_105_cast_fp16 = mul(x = var_9892_cast_fp16_0, y = layers_c3_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_105_cast_fp16")];
+            tensor<int32, [3]> var_9903 = const()[name = string("op_9903"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_485_axes_0 = const()[name = string("input_485_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9904 = transpose(perm = var_9903, x = h_105_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 2560, 1, 1]> input_485 = expand_dims(axes = input_485_axes_0, x = var_9904)[name = string("input_485")];
+            string gate_69_pad_type_0 = const()[name = string("gate_69_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_69_strides_0 = const()[name = string("gate_69_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_69_pad_0 = const()[name = string("gate_69_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_69_dilations_0 = const()[name = string("gate_69_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_69_groups_0 = const()[name = string("gate_69_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_69 = conv(dilations = gate_69_dilations_0, groups = gate_69_groups_0, pad = gate_69_pad_0, pad_type = gate_69_pad_type_0, strides = gate_69_strides_0, weight = layers_c3_5_mlp_gate_proj_weight_palettized, x = input_485)[name = string("gate_69")];
+            string up_35_pad_type_0 = const()[name = string("up_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_35_strides_0 = const()[name = string("up_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_35_pad_0 = const()[name = string("up_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_35_dilations_0 = const()[name = string("up_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_35_groups_0 = const()[name = string("up_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_35 = conv(dilations = up_35_dilations_0, groups = up_35_groups_0, pad = up_35_pad_0, pad_type = up_35_pad_type_0, strides = up_35_strides_0, weight = layers_c3_5_mlp_up_proj_weight_palettized, x = input_485)[name = string("up_35")];
+            string gate_71_mode_0 = const()[name = string("gate_71_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_71 = gelu(mode = gate_71_mode_0, x = gate_69)[name = string("gate_71")];
+            tensor<fp16, [1, 10240, 1, 1]> input_487 = mul(x = gate_71, y = up_35)[name = string("input_487")];
+            string mlp_out_35_pad_type_0 = const()[name = string("mlp_out_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_35_strides_0 = const()[name = string("mlp_out_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_35_pad_0 = const()[name = string("mlp_out_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_35_dilations_0 = const()[name = string("mlp_out_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_35_groups_0 = const()[name = string("mlp_out_35_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_35 = conv(dilations = mlp_out_35_dilations_0, groups = mlp_out_35_groups_0, pad = mlp_out_35_pad_0, pad_type = mlp_out_35_pad_type_0, strides = mlp_out_35_strides_0, weight = layers_c3_5_mlp_down_proj_weight_palettized, x = input_487)[name = string("mlp_out_35")];
+            tensor<int32, [1]> var_9944_axes_0 = const()[name = string("op_9944_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9944 = squeeze(axes = var_9944_axes_0, x = mlp_out_35)[name = string("op_9944")];
+            tensor<int32, [3]> var_9948 = const()[name = string("op_9948"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_9954 = const()[name = string("op_9954"), val = int32(-1)];
+            fp16 const_185_promoted = const()[name = string("const_185_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_331 = transpose(perm = var_9948, x = var_9944)[name = string("transpose_33")];
+            tensor<fp16, [1, 1, 2560]> var_9956 = mul(x = x_331, y = const_185_promoted)[name = string("op_9956")];
+            bool input_489_interleave_0 = const()[name = string("input_489_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_489 = concat(axis = var_9954, interleave = input_489_interleave_0, values = (x_331, var_9956))[name = string("input_489")];
+            tensor<int32, [1]> normed_473_axes_0 = const()[name = string("normed_473_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_9951_to_fp16 = const()[name = string("op_9951_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_473_cast_fp16 = layer_norm(axes = normed_473_axes_0, epsilon = var_9951_to_fp16, x = input_489)[name = string("normed_473_cast_fp16")];
+            tensor<int32, [2]> var_9961_split_sizes_0 = const()[name = string("op_9961_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_9961_axis_0 = const()[name = string("op_9961_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_9961_0, tensor<fp16, [1, 1, 2560]> var_9961_1 = split(axis = var_9961_axis_0, split_sizes = var_9961_split_sizes_0, x = normed_473_cast_fp16)[name = string("op_9961")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_173 = mul(x = var_9961_0, y = layers_c3_5_post_feedforward_layernorm_weight)[name = string("hidden_states_173")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_175_cast_fp16 = add(x = x_329_cast_fp16, y = hidden_states_173)[name = string("hidden_states_175_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_35_begin_0 = const()[name = string("per_layer_slice_35_begin_0"), val = tensor<int32, [3]>([0, 0, 7424])];
+            tensor<int32, [3]> per_layer_slice_35_end_0 = const()[name = string("per_layer_slice_35_end_0"), val = tensor<int32, [3]>([1, 1, 7680])];
+            tensor<bool, [3]> per_layer_slice_35_end_mask_0 = const()[name = string("per_layer_slice_35_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_35_cast_fp16 = slice_by_index(begin = per_layer_slice_35_begin_0, end = per_layer_slice_35_end_0, end_mask = per_layer_slice_35_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_35_cast_fp16")];
+            tensor<int32, [3]> var_9989 = const()[name = string("op_9989"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_491_axes_0 = const()[name = string("input_491_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_9990 = transpose(perm = var_9989, x = hidden_states_175_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [1, 2560, 1, 1]> input_491 = expand_dims(axes = input_491_axes_0, x = var_9990)[name = string("input_491")];
+            string gated_103_pad_type_0 = const()[name = string("gated_103_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_103_strides_0 = const()[name = string("gated_103_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_103_pad_0 = const()[name = string("gated_103_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_103_dilations_0 = const()[name = string("gated_103_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_103_groups_0 = const()[name = string("gated_103_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_103 = conv(dilations = gated_103_dilations_0, groups = gated_103_groups_0, pad = gated_103_pad_0, pad_type = gated_103_pad_type_0, strides = gated_103_strides_0, weight = layers_c3_5_per_layer_input_gate_weight_palettized, x = input_491)[name = string("gated_103")];
+            string gated_105_mode_0 = const()[name = string("gated_105_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_105 = gelu(mode = gated_105_mode_0, x = gated_103)[name = string("gated_105")];
+            tensor<int32, [3]> var_10009 = const()[name = string("op_10009"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_35_axes_0 = const()[name = string("per_layer_slice_conv_35_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_10010_cast_fp16 = transpose(perm = var_10009, x = per_layer_slice_35_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_35_cast_fp16 = expand_dims(axes = per_layer_slice_conv_35_axes_0, x = var_10010_cast_fp16)[name = string("per_layer_slice_conv_35_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_493_cast_fp16 = mul(x = gated_105, y = per_layer_slice_conv_35_cast_fp16)[name = string("input_493_cast_fp16")];
+            string gated_107_pad_type_0 = const()[name = string("gated_107_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_107_strides_0 = const()[name = string("gated_107_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_107_pad_0 = const()[name = string("gated_107_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_107_dilations_0 = const()[name = string("gated_107_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_107_groups_0 = const()[name = string("gated_107_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(975674752))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(976002496))))[name = string("layers_c3_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_107_cast_fp16 = conv(dilations = gated_107_dilations_0, groups = gated_107_groups_0, pad = gated_107_pad_0, pad_type = gated_107_pad_type_0, strides = gated_107_strides_0, weight = layers_c3_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_493_cast_fp16)[name = string("gated_107_cast_fp16")];
+            tensor<int32, [1]> var_10026_axes_0 = const()[name = string("op_10026_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10026_cast_fp16 = squeeze(axes = var_10026_axes_0, x = gated_107_cast_fp16)[name = string("op_10026_cast_fp16")];
+            tensor<int32, [3]> var_10030 = const()[name = string("op_10030"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10036 = const()[name = string("op_10036"), val = int32(-1)];
+            fp16 const_186_promoted_to_fp16 = const()[name = string("const_186_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_333_cast_fp16 = transpose(perm = var_10030, x = var_10026_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [1, 1, 2560]> var_10038_cast_fp16 = mul(x = x_333_cast_fp16, y = const_186_promoted_to_fp16)[name = string("op_10038_cast_fp16")];
+            bool input_495_interleave_0 = const()[name = string("input_495_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_495_cast_fp16 = concat(axis = var_10036, interleave = input_495_interleave_0, values = (x_333_cast_fp16, var_10038_cast_fp16))[name = string("input_495_cast_fp16")];
+            tensor<int32, [1]> normed_477_axes_0 = const()[name = string("normed_477_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10033_to_fp16 = const()[name = string("op_10033_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_477_cast_fp16 = layer_norm(axes = normed_477_axes_0, epsilon = var_10033_to_fp16, x = input_495_cast_fp16)[name = string("normed_477_cast_fp16")];
+            tensor<int32, [2]> var_10043_split_sizes_0 = const()[name = string("op_10043_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10043_axis_0 = const()[name = string("op_10043_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10043_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10043_cast_fp16_1 = split(axis = var_10043_axis_0, split_sizes = var_10043_split_sizes_0, x = normed_477_cast_fp16)[name = string("op_10043_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(976005120)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_179_cast_fp16 = mul(x = var_10043_cast_fp16_0, y = layers_c3_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_179_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_181_cast_fp16 = add(x = hidden_states_175_cast_fp16, y = hidden_states_179_cast_fp16)[name = string("hidden_states_181_cast_fp16")];
+            tensor<fp16, [1]> const_187_promoted_to_fp16 = const()[name = string("const_187_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.aep-2])];
+            tensor<fp16, [1, 1, 2560]> x_335_cast_fp16 = mul(x = hidden_states_181_cast_fp16, y = const_187_promoted_to_fp16)[name = string("x_335_cast_fp16")];
+            int32 var_10058 = const()[name = string("op_10058"), val = int32(-1)];
+            fp16 const_188_promoted_to_fp16 = const()[name = string("const_188_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_10060_cast_fp16 = mul(x = x_335_cast_fp16, y = const_188_promoted_to_fp16)[name = string("op_10060_cast_fp16")];
+            bool input_497_interleave_0 = const()[name = string("input_497_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_497_cast_fp16 = concat(axis = var_10058, interleave = input_497_interleave_0, values = (x_335_cast_fp16, var_10060_cast_fp16))[name = string("input_497_cast_fp16")];
+            tensor<int32, [1]> normed_481_axes_0 = const()[name = string("normed_481_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10055_to_fp16 = const()[name = string("op_10055_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_481_cast_fp16 = layer_norm(axes = normed_481_axes_0, epsilon = var_10055_to_fp16, x = input_497_cast_fp16)[name = string("normed_481_cast_fp16")];
+            tensor<int32, [2]> var_10065_split_sizes_0 = const()[name = string("op_10065_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10065_axis_0 = const()[name = string("op_10065_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10065_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10065_cast_fp16_1 = split(axis = var_10065_axis_0, split_sizes = var_10065_split_sizes_0, x = normed_481_cast_fp16)[name = string("op_10065_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(976010304)))];
+            tensor<fp16, [1, 1, 2560]> h_109_cast_fp16 = mul(x = var_10065_cast_fp16_0, y = layers_c3_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_109_cast_fp16")];
+            tensor<int32, [3]> var_10071 = const()[name = string("op_10071"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_10074_axes_0 = const()[name = string("op_10074_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10072_cast_fp16 = transpose(perm = var_10071, x = h_109_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 2560, 1, 1]> var_10074_cast_fp16 = expand_dims(axes = var_10074_axes_0, x = var_10072_cast_fp16)[name = string("op_10074_cast_fp16")];
+            string var_10090_pad_type_0 = const()[name = string("op_10090_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10090_strides_0 = const()[name = string("op_10090_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10090_pad_0 = const()[name = string("op_10090_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10090_dilations_0 = const()[name = string("op_10090_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10090_groups_0 = const()[name = string("op_10090_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_10090 = conv(dilations = var_10090_dilations_0, groups = var_10090_groups_0, pad = var_10090_pad_0, pad_type = var_10090_pad_type_0, strides = var_10090_strides_0, weight = layers_c3_6_self_attn_q_proj_weight_palettized, x = var_10074_cast_fp16)[name = string("op_10090")];
+            tensor<int32, [4]> var_10095 = const()[name = string("op_10095"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_10096 = reshape(shape = var_10095, x = var_10090)[name = string("op_10096")];
+            tensor<int32, [4]> var_10101 = const()[name = string("op_10101"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_10111 = const()[name = string("op_10111"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_10102 = transpose(perm = var_10101, x = var_10096)[name = string("transpose_28")];
+            tensor<fp16, [1, 8, 256]> x_337 = reshape(shape = var_10111, x = var_10102)[name = string("x_337")];
+            int32 var_10117 = const()[name = string("op_10117"), val = int32(-1)];
+            fp16 const_189_promoted = const()[name = string("const_189_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_10119 = mul(x = x_337, y = const_189_promoted)[name = string("op_10119")];
+            bool input_501_interleave_0 = const()[name = string("input_501_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_501 = concat(axis = var_10117, interleave = input_501_interleave_0, values = (x_337, var_10119))[name = string("input_501")];
+            tensor<int32, [1]> normed_485_axes_0 = const()[name = string("normed_485_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10114_to_fp16 = const()[name = string("op_10114_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_485_cast_fp16 = layer_norm(axes = normed_485_axes_0, epsilon = var_10114_to_fp16, x = input_501)[name = string("normed_485_cast_fp16")];
+            tensor<int32, [2]> var_10124_split_sizes_0 = const()[name = string("op_10124_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_10124_axis_0 = const()[name = string("op_10124_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_10124_0, tensor<fp16, [1, 8, 256]> var_10124_1 = split(axis = var_10124_axis_0, split_sizes = var_10124_split_sizes_0, x = normed_485_cast_fp16)[name = string("op_10124")];
+            tensor<fp16, [1, 8, 256]> var_10126 = mul(x = var_10124_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_10126")];
+            tensor<int32, [4]> var_10131 = const()[name = string("op_10131"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_135 = reshape(shape = var_10131, x = var_10126)[name = string("q_135")];
+            tensor<fp16, [1, 8, 1, 256]> var_10133_cast_fp16 = mul(x = q_135, y = cos_s)[name = string("op_10133_cast_fp16")];
+            tensor<int32, [2]> var_10134_split_sizes_0 = const()[name = string("op_10134_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_10134_axis_0 = const()[name = string("op_10134_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_10134_0, tensor<fp16, [1, 8, 1, 128]> var_10134_1 = split(axis = var_10134_axis_0, split_sizes = var_10134_split_sizes_0, x = q_135)[name = string("op_10134")];
+            fp16 const_190_promoted = const()[name = string("const_190_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_10136 = mul(x = var_10134_1, y = const_190_promoted)[name = string("op_10136")];
+            int32 var_10138 = const()[name = string("op_10138"), val = int32(-1)];
+            bool var_10139_interleave_0 = const()[name = string("op_10139_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_10139 = concat(axis = var_10138, interleave = var_10139_interleave_0, values = (var_10136, var_10134_0))[name = string("op_10139")];
+            tensor<fp16, [1, 8, 1, 256]> var_10140_cast_fp16 = mul(x = var_10139, y = sin_s)[name = string("op_10140_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_137_cast_fp16 = add(x = var_10133_cast_fp16, y = var_10140_cast_fp16)[name = string("q_137_cast_fp16")];
+            bool attn_weights_73_transpose_x_0 = const()[name = string("attn_weights_73_transpose_x_0"), val = bool(false)];
+            bool attn_weights_73_transpose_y_0 = const()[name = string("attn_weights_73_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_73_cast_fp16 = matmul(transpose_x = attn_weights_73_transpose_x_0, transpose_y = attn_weights_73_transpose_y_0, x = q_137_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_73_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_339_cast_fp16 = add(x = attn_weights_73_cast_fp16, y = causal_mask_sliding)[name = string("x_339_cast_fp16")];
+            tensor<int32, [1]> reduce_max_18_axes_0 = const()[name = string("reduce_max_18_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_18_keep_dims_0 = const()[name = string("reduce_max_18_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_18 = reduce_max(axes = reduce_max_18_axes_0, keep_dims = reduce_max_18_keep_dims_0, x = x_339_cast_fp16)[name = string("reduce_max_18")];
+            tensor<fp16, [1, 8, 1, 512]> var_10172 = sub(x = x_339_cast_fp16, y = reduce_max_18)[name = string("op_10172")];
+            tensor<fp16, [1, 8, 1, 512]> var_10178 = exp(x = var_10172)[name = string("op_10178")];
+            tensor<int32, [1]> var_10188_axes_0 = const()[name = string("op_10188_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_10188_keep_dims_0 = const()[name = string("op_10188_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_10188 = reduce_sum(axes = var_10188_axes_0, keep_dims = var_10188_keep_dims_0, x = var_10178)[name = string("op_10188")];
+            tensor<fp16, [1, 8, 1, 512]> var_10194_cast_fp16 = real_div(x = var_10178, y = var_10188)[name = string("op_10194_cast_fp16")];
+            bool attn_output_109_transpose_x_0 = const()[name = string("attn_output_109_transpose_x_0"), val = bool(false)];
+            bool attn_output_109_transpose_y_0 = const()[name = string("attn_output_109_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_109_cast_fp16 = matmul(transpose_x = attn_output_109_transpose_x_0, transpose_y = attn_output_109_transpose_y_0, x = var_10194_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_109_cast_fp16")];
+            tensor<int32, [4]> var_10205 = const()[name = string("op_10205"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_10212 = const()[name = string("op_10212"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_10206_cast_fp16 = transpose(perm = var_10205, x = attn_output_109_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 1, 2048]> attn_output_111_cast_fp16 = reshape(shape = var_10212, x = var_10206_cast_fp16)[name = string("attn_output_111_cast_fp16")];
+            tensor<int32, [3]> var_10217 = const()[name = string("op_10217"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_10233_pad_type_0 = const()[name = string("op_10233_pad_type_0"), val = string("valid")];
+            int32 var_10233_groups_0 = const()[name = string("op_10233_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_10233_strides_0 = const()[name = string("op_10233_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_10233_pad_0 = const()[name = string("op_10233_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_10233_dilations_0 = const()[name = string("op_10233_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_18_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(976015488))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978636992))))[name = string("squeeze_18_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_10218_cast_fp16 = transpose(perm = var_10217, x = attn_output_111_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 1]> var_10233_cast_fp16 = conv(dilations = var_10233_dilations_0, groups = var_10233_groups_0, pad = var_10233_pad_0, pad_type = var_10233_pad_type_0, strides = var_10233_strides_0, weight = squeeze_18_cast_fp16_to_fp32_to_fp16_palettized, x = var_10218_cast_fp16)[name = string("op_10233_cast_fp16")];
+            tensor<int32, [3]> var_10237 = const()[name = string("op_10237"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10243 = const()[name = string("op_10243"), val = int32(-1)];
+            fp16 const_191_promoted_to_fp16 = const()[name = string("const_191_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_343_cast_fp16 = transpose(perm = var_10237, x = var_10233_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 1, 2560]> var_10245_cast_fp16 = mul(x = x_343_cast_fp16, y = const_191_promoted_to_fp16)[name = string("op_10245_cast_fp16")];
+            bool input_505_interleave_0 = const()[name = string("input_505_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_505_cast_fp16 = concat(axis = var_10243, interleave = input_505_interleave_0, values = (x_343_cast_fp16, var_10245_cast_fp16))[name = string("input_505_cast_fp16")];
+            tensor<int32, [1]> normed_489_axes_0 = const()[name = string("normed_489_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10240_to_fp16 = const()[name = string("op_10240_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_489_cast_fp16 = layer_norm(axes = normed_489_axes_0, epsilon = var_10240_to_fp16, x = input_505_cast_fp16)[name = string("normed_489_cast_fp16")];
+            tensor<int32, [2]> var_10250_split_sizes_0 = const()[name = string("op_10250_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10250_axis_0 = const()[name = string("op_10250_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10250_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10250_cast_fp16_1 = split(axis = var_10250_axis_0, split_sizes = var_10250_split_sizes_0, x = normed_489_cast_fp16)[name = string("op_10250_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978639616)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_113_cast_fp16 = mul(x = var_10250_cast_fp16_0, y = layers_c3_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_113_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_345_cast_fp16 = add(x = x_335_cast_fp16, y = attn_output_113_cast_fp16)[name = string("x_345_cast_fp16")];
+            int32 var_10259 = const()[name = string("op_10259"), val = int32(-1)];
+            fp16 const_192_promoted_to_fp16 = const()[name = string("const_192_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_10261_cast_fp16 = mul(x = x_345_cast_fp16, y = const_192_promoted_to_fp16)[name = string("op_10261_cast_fp16")];
+            bool input_507_interleave_0 = const()[name = string("input_507_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_507_cast_fp16 = concat(axis = var_10259, interleave = input_507_interleave_0, values = (x_345_cast_fp16, var_10261_cast_fp16))[name = string("input_507_cast_fp16")];
+            tensor<int32, [1]> normed_493_axes_0 = const()[name = string("normed_493_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10256_to_fp16 = const()[name = string("op_10256_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_493_cast_fp16 = layer_norm(axes = normed_493_axes_0, epsilon = var_10256_to_fp16, x = input_507_cast_fp16)[name = string("normed_493_cast_fp16")];
+            tensor<int32, [2]> var_10266_split_sizes_0 = const()[name = string("op_10266_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10266_axis_0 = const()[name = string("op_10266_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10266_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10266_cast_fp16_1 = split(axis = var_10266_axis_0, split_sizes = var_10266_split_sizes_0, x = normed_493_cast_fp16)[name = string("op_10266_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978644800)))];
+            tensor<fp16, [1, 1, 2560]> h_111_cast_fp16 = mul(x = var_10266_cast_fp16_0, y = layers_c3_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_111_cast_fp16")];
+            tensor<int32, [3]> var_10277 = const()[name = string("op_10277"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_509_axes_0 = const()[name = string("input_509_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10278 = transpose(perm = var_10277, x = h_111_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 1]> input_509 = expand_dims(axes = input_509_axes_0, x = var_10278)[name = string("input_509")];
+            string gate_73_pad_type_0 = const()[name = string("gate_73_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_73_strides_0 = const()[name = string("gate_73_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_73_pad_0 = const()[name = string("gate_73_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_73_dilations_0 = const()[name = string("gate_73_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_73_groups_0 = const()[name = string("gate_73_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_73 = conv(dilations = gate_73_dilations_0, groups = gate_73_groups_0, pad = gate_73_pad_0, pad_type = gate_73_pad_type_0, strides = gate_73_strides_0, weight = layers_c3_6_mlp_gate_proj_weight_palettized, x = input_509)[name = string("gate_73")];
+            string up_37_pad_type_0 = const()[name = string("up_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_37_strides_0 = const()[name = string("up_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_37_pad_0 = const()[name = string("up_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_37_dilations_0 = const()[name = string("up_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_37_groups_0 = const()[name = string("up_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_37 = conv(dilations = up_37_dilations_0, groups = up_37_groups_0, pad = up_37_pad_0, pad_type = up_37_pad_type_0, strides = up_37_strides_0, weight = layers_c3_6_mlp_up_proj_weight_palettized, x = input_509)[name = string("up_37")];
+            string gate_75_mode_0 = const()[name = string("gate_75_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_75 = gelu(mode = gate_75_mode_0, x = gate_73)[name = string("gate_75")];
+            tensor<fp16, [1, 10240, 1, 1]> input_511 = mul(x = gate_75, y = up_37)[name = string("input_511")];
+            string mlp_out_37_pad_type_0 = const()[name = string("mlp_out_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_37_strides_0 = const()[name = string("mlp_out_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_37_pad_0 = const()[name = string("mlp_out_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_37_dilations_0 = const()[name = string("mlp_out_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_37_groups_0 = const()[name = string("mlp_out_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_37 = conv(dilations = mlp_out_37_dilations_0, groups = mlp_out_37_groups_0, pad = mlp_out_37_pad_0, pad_type = mlp_out_37_pad_type_0, strides = mlp_out_37_strides_0, weight = layers_c3_6_mlp_down_proj_weight_palettized, x = input_511)[name = string("mlp_out_37")];
+            tensor<int32, [1]> var_10318_axes_0 = const()[name = string("op_10318_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10318 = squeeze(axes = var_10318_axes_0, x = mlp_out_37)[name = string("op_10318")];
+            tensor<int32, [3]> var_10322 = const()[name = string("op_10322"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10328 = const()[name = string("op_10328"), val = int32(-1)];
+            fp16 const_193_promoted = const()[name = string("const_193_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_347 = transpose(perm = var_10322, x = var_10318)[name = string("transpose_23")];
+            tensor<fp16, [1, 1, 2560]> var_10330 = mul(x = x_347, y = const_193_promoted)[name = string("op_10330")];
+            bool input_513_interleave_0 = const()[name = string("input_513_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_513 = concat(axis = var_10328, interleave = input_513_interleave_0, values = (x_347, var_10330))[name = string("input_513")];
+            tensor<int32, [1]> normed_497_axes_0 = const()[name = string("normed_497_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10325_to_fp16 = const()[name = string("op_10325_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_497_cast_fp16 = layer_norm(axes = normed_497_axes_0, epsilon = var_10325_to_fp16, x = input_513)[name = string("normed_497_cast_fp16")];
+            tensor<int32, [2]> var_10335_split_sizes_0 = const()[name = string("op_10335_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10335_axis_0 = const()[name = string("op_10335_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10335_0, tensor<fp16, [1, 1, 2560]> var_10335_1 = split(axis = var_10335_axis_0, split_sizes = var_10335_split_sizes_0, x = normed_497_cast_fp16)[name = string("op_10335")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_183 = mul(x = var_10335_0, y = layers_c3_6_post_feedforward_layernorm_weight)[name = string("hidden_states_183")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_185_cast_fp16 = add(x = x_345_cast_fp16, y = hidden_states_183)[name = string("hidden_states_185_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_37_begin_0 = const()[name = string("per_layer_slice_37_begin_0"), val = tensor<int32, [3]>([0, 0, 7680])];
+            tensor<int32, [3]> per_layer_slice_37_end_0 = const()[name = string("per_layer_slice_37_end_0"), val = tensor<int32, [3]>([1, 1, 7936])];
+            tensor<bool, [3]> per_layer_slice_37_end_mask_0 = const()[name = string("per_layer_slice_37_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_37_cast_fp16 = slice_by_index(begin = per_layer_slice_37_begin_0, end = per_layer_slice_37_end_0, end_mask = per_layer_slice_37_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_37_cast_fp16")];
+            tensor<int32, [3]> var_10363 = const()[name = string("op_10363"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_515_axes_0 = const()[name = string("input_515_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10364 = transpose(perm = var_10363, x = hidden_states_185_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 2560, 1, 1]> input_515 = expand_dims(axes = input_515_axes_0, x = var_10364)[name = string("input_515")];
+            string gated_109_pad_type_0 = const()[name = string("gated_109_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_109_strides_0 = const()[name = string("gated_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_109_pad_0 = const()[name = string("gated_109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_109_dilations_0 = const()[name = string("gated_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_109_groups_0 = const()[name = string("gated_109_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_109 = conv(dilations = gated_109_dilations_0, groups = gated_109_groups_0, pad = gated_109_pad_0, pad_type = gated_109_pad_type_0, strides = gated_109_strides_0, weight = layers_c3_6_per_layer_input_gate_weight_palettized, x = input_515)[name = string("gated_109")];
+            string gated_111_mode_0 = const()[name = string("gated_111_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_111 = gelu(mode = gated_111_mode_0, x = gated_109)[name = string("gated_111")];
+            tensor<int32, [3]> var_10383 = const()[name = string("op_10383"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_37_axes_0 = const()[name = string("per_layer_slice_conv_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_10384_cast_fp16 = transpose(perm = var_10383, x = per_layer_slice_37_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_37_cast_fp16 = expand_dims(axes = per_layer_slice_conv_37_axes_0, x = var_10384_cast_fp16)[name = string("per_layer_slice_conv_37_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_517_cast_fp16 = mul(x = gated_111, y = per_layer_slice_conv_37_cast_fp16)[name = string("input_517_cast_fp16")];
+            string gated_113_pad_type_0 = const()[name = string("gated_113_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_113_strides_0 = const()[name = string("gated_113_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_113_pad_0 = const()[name = string("gated_113_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_113_dilations_0 = const()[name = string("gated_113_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_113_groups_0 = const()[name = string("gated_113_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978649984))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978977728))))[name = string("layers_c3_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_113_cast_fp16 = conv(dilations = gated_113_dilations_0, groups = gated_113_groups_0, pad = gated_113_pad_0, pad_type = gated_113_pad_type_0, strides = gated_113_strides_0, weight = layers_c3_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_517_cast_fp16)[name = string("gated_113_cast_fp16")];
+            tensor<int32, [1]> var_10400_axes_0 = const()[name = string("op_10400_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10400_cast_fp16 = squeeze(axes = var_10400_axes_0, x = gated_113_cast_fp16)[name = string("op_10400_cast_fp16")];
+            tensor<int32, [3]> var_10404 = const()[name = string("op_10404"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10410 = const()[name = string("op_10410"), val = int32(-1)];
+            fp16 const_194_promoted_to_fp16 = const()[name = string("const_194_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_349_cast_fp16 = transpose(perm = var_10404, x = var_10400_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 1, 2560]> var_10412_cast_fp16 = mul(x = x_349_cast_fp16, y = const_194_promoted_to_fp16)[name = string("op_10412_cast_fp16")];
+            bool input_519_interleave_0 = const()[name = string("input_519_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_519_cast_fp16 = concat(axis = var_10410, interleave = input_519_interleave_0, values = (x_349_cast_fp16, var_10412_cast_fp16))[name = string("input_519_cast_fp16")];
+            tensor<int32, [1]> normed_501_axes_0 = const()[name = string("normed_501_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10407_to_fp16 = const()[name = string("op_10407_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_501_cast_fp16 = layer_norm(axes = normed_501_axes_0, epsilon = var_10407_to_fp16, x = input_519_cast_fp16)[name = string("normed_501_cast_fp16")];
+            tensor<int32, [2]> var_10417_split_sizes_0 = const()[name = string("op_10417_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10417_axis_0 = const()[name = string("op_10417_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10417_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10417_cast_fp16_1 = split(axis = var_10417_axis_0, split_sizes = var_10417_split_sizes_0, x = normed_501_cast_fp16)[name = string("op_10417_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978980352)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_189_cast_fp16 = mul(x = var_10417_cast_fp16_0, y = layers_c3_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_189_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_191_cast_fp16 = add(x = hidden_states_185_cast_fp16, y = hidden_states_189_cast_fp16)[name = string("hidden_states_191_cast_fp16")];
+            tensor<fp16, [1]> const_195_promoted_to_fp16 = const()[name = string("const_195_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 1, 2560]> x_351_cast_fp16 = mul(x = hidden_states_191_cast_fp16, y = const_195_promoted_to_fp16)[name = string("x_351_cast_fp16")];
+            int32 var_10432 = const()[name = string("op_10432"), val = int32(-1)];
+            fp16 const_196_promoted_to_fp16 = const()[name = string("const_196_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_10434_cast_fp16 = mul(x = x_351_cast_fp16, y = const_196_promoted_to_fp16)[name = string("op_10434_cast_fp16")];
+            bool input_521_interleave_0 = const()[name = string("input_521_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_521_cast_fp16 = concat(axis = var_10432, interleave = input_521_interleave_0, values = (x_351_cast_fp16, var_10434_cast_fp16))[name = string("input_521_cast_fp16")];
+            tensor<int32, [1]> normed_505_axes_0 = const()[name = string("normed_505_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10429_to_fp16 = const()[name = string("op_10429_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_505_cast_fp16 = layer_norm(axes = normed_505_axes_0, epsilon = var_10429_to_fp16, x = input_521_cast_fp16)[name = string("normed_505_cast_fp16")];
+            tensor<int32, [2]> var_10439_split_sizes_0 = const()[name = string("op_10439_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10439_axis_0 = const()[name = string("op_10439_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10439_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10439_cast_fp16_1 = split(axis = var_10439_axis_0, split_sizes = var_10439_split_sizes_0, x = normed_505_cast_fp16)[name = string("op_10439_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978985536)))];
+            tensor<fp16, [1, 1, 2560]> h_115_cast_fp16 = mul(x = var_10439_cast_fp16_0, y = layers_c3_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_115_cast_fp16")];
+            tensor<int32, [3]> var_10445 = const()[name = string("op_10445"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_10448_axes_0 = const()[name = string("op_10448_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10446_cast_fp16 = transpose(perm = var_10445, x = h_115_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 2560, 1, 1]> var_10448_cast_fp16 = expand_dims(axes = var_10448_axes_0, x = var_10446_cast_fp16)[name = string("op_10448_cast_fp16")];
+            string var_10464_pad_type_0 = const()[name = string("op_10464_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10464_strides_0 = const()[name = string("op_10464_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10464_pad_0 = const()[name = string("op_10464_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10464_dilations_0 = const()[name = string("op_10464_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10464_groups_0 = const()[name = string("op_10464_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_10464 = conv(dilations = var_10464_dilations_0, groups = var_10464_groups_0, pad = var_10464_pad_0, pad_type = var_10464_pad_type_0, strides = var_10464_strides_0, weight = layers_c3_7_self_attn_q_proj_weight_palettized, x = var_10448_cast_fp16)[name = string("op_10464")];
+            tensor<int32, [4]> var_10469 = const()[name = string("op_10469"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_10470 = reshape(shape = var_10469, x = var_10464)[name = string("op_10470")];
+            tensor<int32, [4]> var_10475 = const()[name = string("op_10475"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_10485 = const()[name = string("op_10485"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_10476 = transpose(perm = var_10475, x = var_10470)[name = string("transpose_18")];
+            tensor<fp16, [1, 8, 256]> x_353 = reshape(shape = var_10485, x = var_10476)[name = string("x_353")];
+            int32 var_10491 = const()[name = string("op_10491"), val = int32(-1)];
+            fp16 const_197_promoted = const()[name = string("const_197_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_10493 = mul(x = x_353, y = const_197_promoted)[name = string("op_10493")];
+            bool input_525_interleave_0 = const()[name = string("input_525_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_525 = concat(axis = var_10491, interleave = input_525_interleave_0, values = (x_353, var_10493))[name = string("input_525")];
+            tensor<int32, [1]> normed_509_axes_0 = const()[name = string("normed_509_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10488_to_fp16 = const()[name = string("op_10488_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_509_cast_fp16 = layer_norm(axes = normed_509_axes_0, epsilon = var_10488_to_fp16, x = input_525)[name = string("normed_509_cast_fp16")];
+            tensor<int32, [2]> var_10498_split_sizes_0 = const()[name = string("op_10498_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_10498_axis_0 = const()[name = string("op_10498_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_10498_0, tensor<fp16, [1, 8, 256]> var_10498_1 = split(axis = var_10498_axis_0, split_sizes = var_10498_split_sizes_0, x = normed_509_cast_fp16)[name = string("op_10498")];
+            tensor<fp16, [1, 8, 256]> var_10500 = mul(x = var_10498_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_10500")];
+            tensor<int32, [4]> var_10505 = const()[name = string("op_10505"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_141 = reshape(shape = var_10505, x = var_10500)[name = string("q_141")];
+            tensor<fp16, [1, 8, 1, 256]> var_10507_cast_fp16 = mul(x = q_141, y = cos_s)[name = string("op_10507_cast_fp16")];
+            tensor<int32, [2]> var_10508_split_sizes_0 = const()[name = string("op_10508_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_10508_axis_0 = const()[name = string("op_10508_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_10508_0, tensor<fp16, [1, 8, 1, 128]> var_10508_1 = split(axis = var_10508_axis_0, split_sizes = var_10508_split_sizes_0, x = q_141)[name = string("op_10508")];
+            fp16 const_198_promoted = const()[name = string("const_198_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_10510 = mul(x = var_10508_1, y = const_198_promoted)[name = string("op_10510")];
+            int32 var_10512 = const()[name = string("op_10512"), val = int32(-1)];
+            bool var_10513_interleave_0 = const()[name = string("op_10513_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_10513 = concat(axis = var_10512, interleave = var_10513_interleave_0, values = (var_10510, var_10508_0))[name = string("op_10513")];
+            tensor<fp16, [1, 8, 1, 256]> var_10514_cast_fp16 = mul(x = var_10513, y = sin_s)[name = string("op_10514_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_143_cast_fp16 = add(x = var_10507_cast_fp16, y = var_10514_cast_fp16)[name = string("q_143_cast_fp16")];
+            bool attn_weights_77_transpose_x_0 = const()[name = string("attn_weights_77_transpose_x_0"), val = bool(false)];
+            bool attn_weights_77_transpose_y_0 = const()[name = string("attn_weights_77_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_77_cast_fp16 = matmul(transpose_x = attn_weights_77_transpose_x_0, transpose_y = attn_weights_77_transpose_y_0, x = q_143_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_77_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_355_cast_fp16 = add(x = attn_weights_77_cast_fp16, y = causal_mask_sliding)[name = string("x_355_cast_fp16")];
+            tensor<int32, [1]> reduce_max_19_axes_0 = const()[name = string("reduce_max_19_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_19_keep_dims_0 = const()[name = string("reduce_max_19_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_19 = reduce_max(axes = reduce_max_19_axes_0, keep_dims = reduce_max_19_keep_dims_0, x = x_355_cast_fp16)[name = string("reduce_max_19")];
+            tensor<fp16, [1, 8, 1, 512]> var_10546 = sub(x = x_355_cast_fp16, y = reduce_max_19)[name = string("op_10546")];
+            tensor<fp16, [1, 8, 1, 512]> var_10552 = exp(x = var_10546)[name = string("op_10552")];
+            tensor<int32, [1]> var_10562_axes_0 = const()[name = string("op_10562_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_10562_keep_dims_0 = const()[name = string("op_10562_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_10562 = reduce_sum(axes = var_10562_axes_0, keep_dims = var_10562_keep_dims_0, x = var_10552)[name = string("op_10562")];
+            tensor<fp16, [1, 8, 1, 512]> var_10568_cast_fp16 = real_div(x = var_10552, y = var_10562)[name = string("op_10568_cast_fp16")];
+            bool attn_output_115_transpose_x_0 = const()[name = string("attn_output_115_transpose_x_0"), val = bool(false)];
+            bool attn_output_115_transpose_y_0 = const()[name = string("attn_output_115_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_115_cast_fp16 = matmul(transpose_x = attn_output_115_transpose_x_0, transpose_y = attn_output_115_transpose_y_0, x = var_10568_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_115_cast_fp16")];
+            tensor<int32, [4]> var_10579 = const()[name = string("op_10579"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_10586 = const()[name = string("op_10586"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_10580_cast_fp16 = transpose(perm = var_10579, x = attn_output_115_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 1, 2048]> attn_output_117_cast_fp16 = reshape(shape = var_10586, x = var_10580_cast_fp16)[name = string("attn_output_117_cast_fp16")];
+            tensor<int32, [3]> var_10591 = const()[name = string("op_10591"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_10607_pad_type_0 = const()[name = string("op_10607_pad_type_0"), val = string("valid")];
+            int32 var_10607_groups_0 = const()[name = string("op_10607_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_10607_strides_0 = const()[name = string("op_10607_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_10607_pad_0 = const()[name = string("op_10607_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_10607_dilations_0 = const()[name = string("op_10607_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_19_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(978990720))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981612224))))[name = string("squeeze_19_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_10592_cast_fp16 = transpose(perm = var_10591, x = attn_output_117_cast_fp16)[name = string("transpose_16")];
+            tensor<fp16, [1, 2560, 1]> var_10607_cast_fp16 = conv(dilations = var_10607_dilations_0, groups = var_10607_groups_0, pad = var_10607_pad_0, pad_type = var_10607_pad_type_0, strides = var_10607_strides_0, weight = squeeze_19_cast_fp16_to_fp32_to_fp16_palettized, x = var_10592_cast_fp16)[name = string("op_10607_cast_fp16")];
+            tensor<int32, [3]> var_10611 = const()[name = string("op_10611"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10617 = const()[name = string("op_10617"), val = int32(-1)];
+            fp16 const_199_promoted_to_fp16 = const()[name = string("const_199_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_359_cast_fp16 = transpose(perm = var_10611, x = var_10607_cast_fp16)[name = string("transpose_15")];
+            tensor<fp16, [1, 1, 2560]> var_10619_cast_fp16 = mul(x = x_359_cast_fp16, y = const_199_promoted_to_fp16)[name = string("op_10619_cast_fp16")];
+            bool input_529_interleave_0 = const()[name = string("input_529_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_529_cast_fp16 = concat(axis = var_10617, interleave = input_529_interleave_0, values = (x_359_cast_fp16, var_10619_cast_fp16))[name = string("input_529_cast_fp16")];
+            tensor<int32, [1]> normed_513_axes_0 = const()[name = string("normed_513_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10614_to_fp16 = const()[name = string("op_10614_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_513_cast_fp16 = layer_norm(axes = normed_513_axes_0, epsilon = var_10614_to_fp16, x = input_529_cast_fp16)[name = string("normed_513_cast_fp16")];
+            tensor<int32, [2]> var_10624_split_sizes_0 = const()[name = string("op_10624_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10624_axis_0 = const()[name = string("op_10624_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10624_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10624_cast_fp16_1 = split(axis = var_10624_axis_0, split_sizes = var_10624_split_sizes_0, x = normed_513_cast_fp16)[name = string("op_10624_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981614848)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_119_cast_fp16 = mul(x = var_10624_cast_fp16_0, y = layers_c3_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_119_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_361_cast_fp16 = add(x = x_351_cast_fp16, y = attn_output_119_cast_fp16)[name = string("x_361_cast_fp16")];
+            int32 var_10633 = const()[name = string("op_10633"), val = int32(-1)];
+            fp16 const_200_promoted_to_fp16 = const()[name = string("const_200_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_10635_cast_fp16 = mul(x = x_361_cast_fp16, y = const_200_promoted_to_fp16)[name = string("op_10635_cast_fp16")];
+            bool input_531_interleave_0 = const()[name = string("input_531_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_531_cast_fp16 = concat(axis = var_10633, interleave = input_531_interleave_0, values = (x_361_cast_fp16, var_10635_cast_fp16))[name = string("input_531_cast_fp16")];
+            tensor<int32, [1]> normed_517_axes_0 = const()[name = string("normed_517_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10630_to_fp16 = const()[name = string("op_10630_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_517_cast_fp16 = layer_norm(axes = normed_517_axes_0, epsilon = var_10630_to_fp16, x = input_531_cast_fp16)[name = string("normed_517_cast_fp16")];
+            tensor<int32, [2]> var_10640_split_sizes_0 = const()[name = string("op_10640_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10640_axis_0 = const()[name = string("op_10640_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10640_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10640_cast_fp16_1 = split(axis = var_10640_axis_0, split_sizes = var_10640_split_sizes_0, x = normed_517_cast_fp16)[name = string("op_10640_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981620032)))];
+            tensor<fp16, [1, 1, 2560]> h_117_cast_fp16 = mul(x = var_10640_cast_fp16_0, y = layers_c3_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_117_cast_fp16")];
+            tensor<int32, [3]> var_10651 = const()[name = string("op_10651"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_533_axes_0 = const()[name = string("input_533_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10652 = transpose(perm = var_10651, x = h_117_cast_fp16)[name = string("transpose_14")];
+            tensor<fp16, [1, 2560, 1, 1]> input_533 = expand_dims(axes = input_533_axes_0, x = var_10652)[name = string("input_533")];
+            string gate_77_pad_type_0 = const()[name = string("gate_77_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_77_strides_0 = const()[name = string("gate_77_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_77_pad_0 = const()[name = string("gate_77_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_77_dilations_0 = const()[name = string("gate_77_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_77_groups_0 = const()[name = string("gate_77_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_77 = conv(dilations = gate_77_dilations_0, groups = gate_77_groups_0, pad = gate_77_pad_0, pad_type = gate_77_pad_type_0, strides = gate_77_strides_0, weight = layers_c3_7_mlp_gate_proj_weight_palettized, x = input_533)[name = string("gate_77")];
+            string up_39_pad_type_0 = const()[name = string("up_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_39_strides_0 = const()[name = string("up_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_39_pad_0 = const()[name = string("up_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_39_dilations_0 = const()[name = string("up_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_39_groups_0 = const()[name = string("up_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_39 = conv(dilations = up_39_dilations_0, groups = up_39_groups_0, pad = up_39_pad_0, pad_type = up_39_pad_type_0, strides = up_39_strides_0, weight = layers_c3_7_mlp_up_proj_weight_palettized, x = input_533)[name = string("up_39")];
+            string gate_79_mode_0 = const()[name = string("gate_79_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_79 = gelu(mode = gate_79_mode_0, x = gate_77)[name = string("gate_79")];
+            tensor<fp16, [1, 10240, 1, 1]> input_535 = mul(x = gate_79, y = up_39)[name = string("input_535")];
+            string mlp_out_39_pad_type_0 = const()[name = string("mlp_out_39_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_39_strides_0 = const()[name = string("mlp_out_39_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_39_pad_0 = const()[name = string("mlp_out_39_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_39_dilations_0 = const()[name = string("mlp_out_39_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_39_groups_0 = const()[name = string("mlp_out_39_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_39 = conv(dilations = mlp_out_39_dilations_0, groups = mlp_out_39_groups_0, pad = mlp_out_39_pad_0, pad_type = mlp_out_39_pad_type_0, strides = mlp_out_39_strides_0, weight = layers_c3_7_mlp_down_proj_weight_palettized, x = input_535)[name = string("mlp_out_39")];
+            tensor<int32, [1]> var_10692_axes_0 = const()[name = string("op_10692_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10692 = squeeze(axes = var_10692_axes_0, x = mlp_out_39)[name = string("op_10692")];
+            tensor<int32, [3]> var_10696 = const()[name = string("op_10696"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10702 = const()[name = string("op_10702"), val = int32(-1)];
+            fp16 const_201_promoted = const()[name = string("const_201_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_363 = transpose(perm = var_10696, x = var_10692)[name = string("transpose_13")];
+            tensor<fp16, [1, 1, 2560]> var_10704 = mul(x = x_363, y = const_201_promoted)[name = string("op_10704")];
+            bool input_537_interleave_0 = const()[name = string("input_537_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_537 = concat(axis = var_10702, interleave = input_537_interleave_0, values = (x_363, var_10704))[name = string("input_537")];
+            tensor<int32, [1]> normed_521_axes_0 = const()[name = string("normed_521_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10699_to_fp16 = const()[name = string("op_10699_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_521_cast_fp16 = layer_norm(axes = normed_521_axes_0, epsilon = var_10699_to_fp16, x = input_537)[name = string("normed_521_cast_fp16")];
+            tensor<int32, [2]> var_10709_split_sizes_0 = const()[name = string("op_10709_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10709_axis_0 = const()[name = string("op_10709_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10709_0, tensor<fp16, [1, 1, 2560]> var_10709_1 = split(axis = var_10709_axis_0, split_sizes = var_10709_split_sizes_0, x = normed_521_cast_fp16)[name = string("op_10709")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_193 = mul(x = var_10709_0, y = layers_c3_7_post_feedforward_layernorm_weight)[name = string("hidden_states_193")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_195_cast_fp16 = add(x = x_361_cast_fp16, y = hidden_states_193)[name = string("hidden_states_195_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_39_begin_0 = const()[name = string("per_layer_slice_39_begin_0"), val = tensor<int32, [3]>([0, 0, 7936])];
+            tensor<int32, [3]> per_layer_slice_39_end_0 = const()[name = string("per_layer_slice_39_end_0"), val = tensor<int32, [3]>([1, 1, 8192])];
+            tensor<bool, [3]> per_layer_slice_39_end_mask_0 = const()[name = string("per_layer_slice_39_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_39_cast_fp16 = slice_by_index(begin = per_layer_slice_39_begin_0, end = per_layer_slice_39_end_0, end_mask = per_layer_slice_39_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_39_cast_fp16")];
+            tensor<int32, [3]> var_10737 = const()[name = string("op_10737"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_539_axes_0 = const()[name = string("input_539_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10738 = transpose(perm = var_10737, x = hidden_states_195_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [1, 2560, 1, 1]> input_539 = expand_dims(axes = input_539_axes_0, x = var_10738)[name = string("input_539")];
+            string gated_115_pad_type_0 = const()[name = string("gated_115_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_115_strides_0 = const()[name = string("gated_115_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_115_pad_0 = const()[name = string("gated_115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_115_dilations_0 = const()[name = string("gated_115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_115_groups_0 = const()[name = string("gated_115_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_115 = conv(dilations = gated_115_dilations_0, groups = gated_115_groups_0, pad = gated_115_pad_0, pad_type = gated_115_pad_type_0, strides = gated_115_strides_0, weight = layers_c3_7_per_layer_input_gate_weight_palettized, x = input_539)[name = string("gated_115")];
+            string gated_117_mode_0 = const()[name = string("gated_117_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_117 = gelu(mode = gated_117_mode_0, x = gated_115)[name = string("gated_117")];
+            tensor<int32, [3]> var_10757 = const()[name = string("op_10757"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_39_axes_0 = const()[name = string("per_layer_slice_conv_39_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_10758_cast_fp16 = transpose(perm = var_10757, x = per_layer_slice_39_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_39_cast_fp16 = expand_dims(axes = per_layer_slice_conv_39_axes_0, x = var_10758_cast_fp16)[name = string("per_layer_slice_conv_39_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_541_cast_fp16 = mul(x = gated_117, y = per_layer_slice_conv_39_cast_fp16)[name = string("input_541_cast_fp16")];
+            string gated_119_pad_type_0 = const()[name = string("gated_119_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_119_strides_0 = const()[name = string("gated_119_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_119_pad_0 = const()[name = string("gated_119_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_119_dilations_0 = const()[name = string("gated_119_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_119_groups_0 = const()[name = string("gated_119_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981625216))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981952960))))[name = string("layers_c3_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_119_cast_fp16 = conv(dilations = gated_119_dilations_0, groups = gated_119_groups_0, pad = gated_119_pad_0, pad_type = gated_119_pad_type_0, strides = gated_119_strides_0, weight = layers_c3_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_541_cast_fp16)[name = string("gated_119_cast_fp16")];
+            tensor<int32, [1]> var_10774_axes_0 = const()[name = string("op_10774_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10774_cast_fp16 = squeeze(axes = var_10774_axes_0, x = gated_119_cast_fp16)[name = string("op_10774_cast_fp16")];
+            tensor<int32, [3]> var_10778 = const()[name = string("op_10778"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10784 = const()[name = string("op_10784"), val = int32(-1)];
+            fp16 const_202_promoted_to_fp16 = const()[name = string("const_202_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_365_cast_fp16 = transpose(perm = var_10778, x = var_10774_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [1, 1, 2560]> var_10786_cast_fp16 = mul(x = x_365_cast_fp16, y = const_202_promoted_to_fp16)[name = string("op_10786_cast_fp16")];
+            bool input_543_interleave_0 = const()[name = string("input_543_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_543_cast_fp16 = concat(axis = var_10784, interleave = input_543_interleave_0, values = (x_365_cast_fp16, var_10786_cast_fp16))[name = string("input_543_cast_fp16")];
+            tensor<int32, [1]> normed_525_axes_0 = const()[name = string("normed_525_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10781_to_fp16 = const()[name = string("op_10781_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_525_cast_fp16 = layer_norm(axes = normed_525_axes_0, epsilon = var_10781_to_fp16, x = input_543_cast_fp16)[name = string("normed_525_cast_fp16")];
+            tensor<int32, [2]> var_10791_split_sizes_0 = const()[name = string("op_10791_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10791_axis_0 = const()[name = string("op_10791_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10791_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10791_cast_fp16_1 = split(axis = var_10791_axis_0, split_sizes = var_10791_split_sizes_0, x = normed_525_cast_fp16)[name = string("op_10791_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981955584)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_199_cast_fp16 = mul(x = var_10791_cast_fp16_0, y = layers_c3_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_199_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_201_cast_fp16 = add(x = hidden_states_195_cast_fp16, y = hidden_states_199_cast_fp16)[name = string("hidden_states_201_cast_fp16")];
+            tensor<fp16, [1]> const_203_promoted_to_fp16 = const()[name = string("const_203_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a2p-1])];
+            tensor<fp16, [1, 1, 2560]> x_367_cast_fp16 = mul(x = hidden_states_201_cast_fp16, y = const_203_promoted_to_fp16)[name = string("x_367_cast_fp16")];
+            int32 var_10806 = const()[name = string("op_10806"), val = int32(-1)];
+            fp16 const_204_promoted_to_fp16 = const()[name = string("const_204_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_10808_cast_fp16 = mul(x = x_367_cast_fp16, y = const_204_promoted_to_fp16)[name = string("op_10808_cast_fp16")];
+            bool input_545_interleave_0 = const()[name = string("input_545_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_545_cast_fp16 = concat(axis = var_10806, interleave = input_545_interleave_0, values = (x_367_cast_fp16, var_10808_cast_fp16))[name = string("input_545_cast_fp16")];
+            tensor<int32, [1]> normed_529_axes_0 = const()[name = string("normed_529_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10803_to_fp16 = const()[name = string("op_10803_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_529_cast_fp16 = layer_norm(axes = normed_529_axes_0, epsilon = var_10803_to_fp16, x = input_545_cast_fp16)[name = string("normed_529_cast_fp16")];
+            tensor<int32, [2]> var_10813_split_sizes_0 = const()[name = string("op_10813_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10813_axis_0 = const()[name = string("op_10813_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10813_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10813_cast_fp16_1 = split(axis = var_10813_axis_0, split_sizes = var_10813_split_sizes_0, x = normed_529_cast_fp16)[name = string("op_10813_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981960768)))];
+            tensor<fp16, [1, 1, 2560]> h_121_cast_fp16 = mul(x = var_10813_cast_fp16_0, y = layers_c3_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_121_cast_fp16")];
+            tensor<int32, [3]> var_10819 = const()[name = string("op_10819"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_10822_axes_0 = const()[name = string("op_10822_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_10820_cast_fp16 = transpose(perm = var_10819, x = h_121_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 2560, 1, 1]> var_10822_cast_fp16 = expand_dims(axes = var_10822_axes_0, x = var_10820_cast_fp16)[name = string("op_10822_cast_fp16")];
+            string var_10838_pad_type_0 = const()[name = string("op_10838_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_10838_strides_0 = const()[name = string("op_10838_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_10838_pad_0 = const()[name = string("op_10838_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_10838_dilations_0 = const()[name = string("op_10838_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_10838_groups_0 = const()[name = string("op_10838_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_10838 = conv(dilations = var_10838_dilations_0, groups = var_10838_groups_0, pad = var_10838_pad_0, pad_type = var_10838_pad_type_0, strides = var_10838_strides_0, weight = layers_c3_8_self_attn_q_proj_weight_palettized, x = var_10822_cast_fp16)[name = string("op_10838")];
+            tensor<int32, [4]> var_10843 = const()[name = string("op_10843"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_10844 = reshape(shape = var_10843, x = var_10838)[name = string("op_10844")];
+            tensor<int32, [4]> var_10849 = const()[name = string("op_10849"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_10859 = const()[name = string("op_10859"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_10850 = transpose(perm = var_10849, x = var_10844)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 256]> x_369 = reshape(shape = var_10859, x = var_10850)[name = string("x_369")];
+            int32 var_10865 = const()[name = string("op_10865"), val = int32(-1)];
+            fp16 const_205_promoted = const()[name = string("const_205_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_10867 = mul(x = x_369, y = const_205_promoted)[name = string("op_10867")];
+            bool input_549_interleave_0 = const()[name = string("input_549_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_549 = concat(axis = var_10865, interleave = input_549_interleave_0, values = (x_369, var_10867))[name = string("input_549")];
+            tensor<int32, [1]> normed_533_axes_0 = const()[name = string("normed_533_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10862_to_fp16 = const()[name = string("op_10862_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_533_cast_fp16 = layer_norm(axes = normed_533_axes_0, epsilon = var_10862_to_fp16, x = input_549)[name = string("normed_533_cast_fp16")];
+            tensor<int32, [2]> var_10872_split_sizes_0 = const()[name = string("op_10872_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_10872_axis_0 = const()[name = string("op_10872_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_10872_0, tensor<fp16, [1, 8, 256]> var_10872_1 = split(axis = var_10872_axis_0, split_sizes = var_10872_split_sizes_0, x = normed_533_cast_fp16)[name = string("op_10872")];
+            tensor<fp16, [1, 8, 256]> var_10874 = mul(x = var_10872_0, y = layers_c2_10_self_attn_q_norm_weight)[name = string("op_10874")];
+            tensor<int32, [4]> var_10879 = const()[name = string("op_10879"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_147 = reshape(shape = var_10879, x = var_10874)[name = string("q_147")];
+            tensor<fp16, [1, 8, 1, 256]> var_10881_cast_fp16 = mul(x = q_147, y = cos_s)[name = string("op_10881_cast_fp16")];
+            tensor<int32, [2]> var_10882_split_sizes_0 = const()[name = string("op_10882_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_10882_axis_0 = const()[name = string("op_10882_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_10882_0, tensor<fp16, [1, 8, 1, 128]> var_10882_1 = split(axis = var_10882_axis_0, split_sizes = var_10882_split_sizes_0, x = q_147)[name = string("op_10882")];
+            fp16 const_206_promoted = const()[name = string("const_206_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_10884 = mul(x = var_10882_1, y = const_206_promoted)[name = string("op_10884")];
+            int32 var_10886 = const()[name = string("op_10886"), val = int32(-1)];
+            bool var_10887_interleave_0 = const()[name = string("op_10887_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_10887 = concat(axis = var_10886, interleave = var_10887_interleave_0, values = (var_10884, var_10882_0))[name = string("op_10887")];
+            tensor<fp16, [1, 8, 1, 256]> var_10888_cast_fp16 = mul(x = var_10887, y = sin_s)[name = string("op_10888_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_cast_fp16 = add(x = var_10881_cast_fp16, y = var_10888_cast_fp16)[name = string("q_cast_fp16")];
+            bool attn_weights_81_transpose_x_0 = const()[name = string("attn_weights_81_transpose_x_0"), val = bool(false)];
+            bool attn_weights_81_transpose_y_0 = const()[name = string("attn_weights_81_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_81_cast_fp16 = matmul(transpose_x = attn_weights_81_transpose_x_0, transpose_y = attn_weights_81_transpose_y_0, x = q_cast_fp16, y = transpose_94_cast_fp16)[name = string("attn_weights_81_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_371_cast_fp16 = add(x = attn_weights_81_cast_fp16, y = causal_mask_sliding)[name = string("x_371_cast_fp16")];
+            tensor<int32, [1]> reduce_max_20_axes_0 = const()[name = string("reduce_max_20_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_20_keep_dims_0 = const()[name = string("reduce_max_20_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_20 = reduce_max(axes = reduce_max_20_axes_0, keep_dims = reduce_max_20_keep_dims_0, x = x_371_cast_fp16)[name = string("reduce_max_20")];
+            tensor<fp16, [1, 8, 1, 512]> var_10920 = sub(x = x_371_cast_fp16, y = reduce_max_20)[name = string("op_10920")];
+            tensor<fp16, [1, 8, 1, 512]> var_10926 = exp(x = var_10920)[name = string("op_10926")];
+            tensor<int32, [1]> var_10936_axes_0 = const()[name = string("op_10936_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_10936_keep_dims_0 = const()[name = string("op_10936_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_10936 = reduce_sum(axes = var_10936_axes_0, keep_dims = var_10936_keep_dims_0, x = var_10926)[name = string("op_10936")];
+            tensor<fp16, [1, 8, 1, 512]> var_10942_cast_fp16 = real_div(x = var_10926, y = var_10936)[name = string("op_10942_cast_fp16")];
+            bool attn_output_121_transpose_x_0 = const()[name = string("attn_output_121_transpose_x_0"), val = bool(false)];
+            bool attn_output_121_transpose_y_0 = const()[name = string("attn_output_121_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_121_cast_fp16 = matmul(transpose_x = attn_output_121_transpose_x_0, transpose_y = attn_output_121_transpose_y_0, x = var_10942_cast_fp16, y = V_expanded_21_cast_fp16)[name = string("attn_output_121_cast_fp16")];
+            tensor<int32, [4]> var_10953 = const()[name = string("op_10953"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_10960 = const()[name = string("op_10960"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_10954_cast_fp16 = transpose(perm = var_10953, x = attn_output_121_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1, 2048]> attn_output_123_cast_fp16 = reshape(shape = var_10960, x = var_10954_cast_fp16)[name = string("attn_output_123_cast_fp16")];
+            tensor<int32, [3]> var_10965 = const()[name = string("op_10965"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_10981_pad_type_0 = const()[name = string("op_10981_pad_type_0"), val = string("valid")];
+            int32 var_10981_groups_0 = const()[name = string("op_10981_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_10981_strides_0 = const()[name = string("op_10981_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_10981_pad_0 = const()[name = string("op_10981_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_10981_dilations_0 = const()[name = string("op_10981_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_20_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(981965952))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(984587456))))[name = string("squeeze_20_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_10966_cast_fp16 = transpose(perm = var_10965, x = attn_output_123_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1]> var_10981_cast_fp16 = conv(dilations = var_10981_dilations_0, groups = var_10981_groups_0, pad = var_10981_pad_0, pad_type = var_10981_pad_type_0, strides = var_10981_strides_0, weight = squeeze_20_cast_fp16_to_fp32_to_fp16_palettized, x = var_10966_cast_fp16)[name = string("op_10981_cast_fp16")];
+            tensor<int32, [3]> var_10985 = const()[name = string("op_10985"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_10991 = const()[name = string("op_10991"), val = int32(-1)];
+            fp16 const_207_promoted_to_fp16 = const()[name = string("const_207_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_375_cast_fp16 = transpose(perm = var_10985, x = var_10981_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 1, 2560]> var_10993_cast_fp16 = mul(x = x_375_cast_fp16, y = const_207_promoted_to_fp16)[name = string("op_10993_cast_fp16")];
+            bool input_553_interleave_0 = const()[name = string("input_553_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_553_cast_fp16 = concat(axis = var_10991, interleave = input_553_interleave_0, values = (x_375_cast_fp16, var_10993_cast_fp16))[name = string("input_553_cast_fp16")];
+            tensor<int32, [1]> normed_537_axes_0 = const()[name = string("normed_537_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_10988_to_fp16 = const()[name = string("op_10988_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_537_cast_fp16 = layer_norm(axes = normed_537_axes_0, epsilon = var_10988_to_fp16, x = input_553_cast_fp16)[name = string("normed_537_cast_fp16")];
+            tensor<int32, [2]> var_10998_split_sizes_0 = const()[name = string("op_10998_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_10998_axis_0 = const()[name = string("op_10998_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_10998_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_10998_cast_fp16_1 = split(axis = var_10998_axis_0, split_sizes = var_10998_split_sizes_0, x = normed_537_cast_fp16)[name = string("op_10998_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(984590080)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_cast_fp16 = mul(x = var_10998_cast_fp16_0, y = layers_c3_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_377_cast_fp16 = add(x = x_367_cast_fp16, y = attn_output_cast_fp16)[name = string("x_377_cast_fp16")];
+            int32 var_11007 = const()[name = string("op_11007"), val = int32(-1)];
+            fp16 const_208_promoted_to_fp16 = const()[name = string("const_208_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_11009_cast_fp16 = mul(x = x_377_cast_fp16, y = const_208_promoted_to_fp16)[name = string("op_11009_cast_fp16")];
+            bool input_555_interleave_0 = const()[name = string("input_555_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_555_cast_fp16 = concat(axis = var_11007, interleave = input_555_interleave_0, values = (x_377_cast_fp16, var_11009_cast_fp16))[name = string("input_555_cast_fp16")];
+            tensor<int32, [1]> normed_541_axes_0 = const()[name = string("normed_541_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_11004_to_fp16 = const()[name = string("op_11004_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_541_cast_fp16 = layer_norm(axes = normed_541_axes_0, epsilon = var_11004_to_fp16, x = input_555_cast_fp16)[name = string("normed_541_cast_fp16")];
+            tensor<int32, [2]> var_11014_split_sizes_0 = const()[name = string("op_11014_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_11014_axis_0 = const()[name = string("op_11014_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_11014_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_11014_cast_fp16_1 = split(axis = var_11014_axis_0, split_sizes = var_11014_split_sizes_0, x = normed_541_cast_fp16)[name = string("op_11014_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_c3_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(984595264)))];
+            tensor<fp16, [1, 1, 2560]> h_123_cast_fp16 = mul(x = var_11014_cast_fp16_0, y = layers_c3_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_123_cast_fp16")];
+            tensor<int32, [3]> var_11025 = const()[name = string("op_11025"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_557_axes_0 = const()[name = string("input_557_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_11026 = transpose(perm = var_11025, x = h_123_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 1]> input_557 = expand_dims(axes = input_557_axes_0, x = var_11026)[name = string("input_557")];
+            string gate_81_pad_type_0 = const()[name = string("gate_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_81_strides_0 = const()[name = string("gate_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_81_pad_0 = const()[name = string("gate_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_81_dilations_0 = const()[name = string("gate_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_81_groups_0 = const()[name = string("gate_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_81 = conv(dilations = gate_81_dilations_0, groups = gate_81_groups_0, pad = gate_81_pad_0, pad_type = gate_81_pad_type_0, strides = gate_81_strides_0, weight = layers_c3_8_mlp_gate_proj_weight_palettized, x = input_557)[name = string("gate_81")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_c3_8_mlp_up_proj_weight_palettized, x = input_557)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate = gelu(mode = gate_mode_0, x = gate_81)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 1]> input_559 = mul(x = gate, y = up)[name = string("input_559")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_c3_8_mlp_down_proj_weight_palettized, x = input_559)[name = string("mlp_out")];
+            tensor<int32, [1]> var_11066_axes_0 = const()[name = string("op_11066_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_11066 = squeeze(axes = var_11066_axes_0, x = mlp_out)[name = string("op_11066")];
+            tensor<int32, [3]> var_11070 = const()[name = string("op_11070"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_11076 = const()[name = string("op_11076"), val = int32(-1)];
+            fp16 const_209_promoted = const()[name = string("const_209_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_379 = transpose(perm = var_11070, x = var_11066)[name = string("transpose_3")];
+            tensor<fp16, [1, 1, 2560]> var_11078 = mul(x = x_379, y = const_209_promoted)[name = string("op_11078")];
+            bool input_561_interleave_0 = const()[name = string("input_561_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_561 = concat(axis = var_11076, interleave = input_561_interleave_0, values = (x_379, var_11078))[name = string("input_561")];
+            tensor<int32, [1]> normed_545_axes_0 = const()[name = string("normed_545_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_11073_to_fp16 = const()[name = string("op_11073_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_545_cast_fp16 = layer_norm(axes = normed_545_axes_0, epsilon = var_11073_to_fp16, x = input_561)[name = string("normed_545_cast_fp16")];
+            tensor<int32, [2]> var_11083_split_sizes_0 = const()[name = string("op_11083_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_11083_axis_0 = const()[name = string("op_11083_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_11083_0, tensor<fp16, [1, 1, 2560]> var_11083_1 = split(axis = var_11083_axis_0, split_sizes = var_11083_split_sizes_0, x = normed_545_cast_fp16)[name = string("op_11083")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_203 = mul(x = var_11083_0, y = layers_c3_8_post_feedforward_layernorm_weight)[name = string("hidden_states_203")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_205_cast_fp16 = add(x = x_377_cast_fp16, y = hidden_states_203)[name = string("hidden_states_205_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 8192])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 1, 8448])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_11111 = const()[name = string("op_11111"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_563_axes_0 = const()[name = string("input_563_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_11112 = transpose(perm = var_11111, x = hidden_states_205_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 1]> input_563 = expand_dims(axes = input_563_axes_0, x = var_11112)[name = string("input_563")];
+            string gated_121_pad_type_0 = const()[name = string("gated_121_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_121_strides_0 = const()[name = string("gated_121_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_121_pad_0 = const()[name = string("gated_121_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_121_dilations_0 = const()[name = string("gated_121_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_121_groups_0 = const()[name = string("gated_121_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_121 = conv(dilations = gated_121_dilations_0, groups = gated_121_groups_0, pad = gated_121_pad_0, pad_type = gated_121_pad_type_0, strides = gated_121_strides_0, weight = layers_c3_8_per_layer_input_gate_weight_palettized, x = input_563)[name = string("gated_121")];
+            string gated_123_mode_0 = const()[name = string("gated_123_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_123 = gelu(mode = gated_123_mode_0, x = gated_121)[name = string("gated_123")];
+            tensor<int32, [3]> var_11131 = const()[name = string("op_11131"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_11132_cast_fp16 = transpose(perm = var_11131, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_11132_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_565_cast_fp16 = mul(x = gated_123, y = per_layer_slice_conv_cast_fp16)[name = string("input_565_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_c3_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(984600448))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(984928192))))[name = string("layers_c3_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_c3_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_565_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_11148_axes_0 = const()[name = string("op_11148_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_11148_cast_fp16 = squeeze(axes = var_11148_axes_0, x = gated_cast_fp16)[name = string("op_11148_cast_fp16")];
+            tensor<int32, [3]> var_11152 = const()[name = string("op_11152"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_11158 = const()[name = string("op_11158"), val = int32(-1)];
+            fp16 const_210_promoted_to_fp16 = const()[name = string("const_210_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_cast_fp16 = transpose(perm = var_11152, x = var_11148_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 1, 2560]> var_11160_cast_fp16 = mul(x = x_cast_fp16, y = const_210_promoted_to_fp16)[name = string("op_11160_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_cast_fp16 = concat(axis = var_11158, interleave = input_interleave_0, values = (x_cast_fp16, var_11160_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_549_axes_0 = const()[name = string("normed_549_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_11155_to_fp16 = const()[name = string("op_11155_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_549_cast_fp16 = layer_norm(axes = normed_549_axes_0, epsilon = var_11155_to_fp16, x = input_cast_fp16)[name = string("normed_549_cast_fp16")];
+            tensor<int32, [2]> var_11165_split_sizes_0 = const()[name = string("op_11165_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_11165_axis_0 = const()[name = string("op_11165_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_11165_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_11165_cast_fp16_1 = split(axis = var_11165_axis_0, split_sizes = var_11165_split_sizes_0, x = normed_549_cast_fp16)[name = string("op_11165_cast_fp16")];
+            tensor<fp16, [2560]> layers_c3_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_c3_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(984930816)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_209_cast_fp16 = mul(x = var_11165_cast_fp16_0, y = layers_c3_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_209_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_205_cast_fp16, y = hidden_states_209_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_211_promoted_to_fp16 = const()[name = string("const_211_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b4p-1])];
+            tensor<fp16, [1, 1, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_211_promoted_to_fp16)[name = string("op_11175_cast_fp16")];
+            int32 var_11178_axis_0 = const()[name = string("op_11178_axis_0"), val = int32(0)];
+            tensor<fp16, [10, 2, 512, 512]> K_sliding_out = stack(axis = var_11178_axis_0, values = (var_1722_cast_fp16, var_2281_cast_fp16, var_2840_cast_fp16, var_3399_cast_fp16, var_3958_cast_fp16, var_5034_cast_fp16, var_5593_cast_fp16, var_6152_cast_fp16, var_6711_cast_fp16, var_7280_cast_fp16))[name = string("op_11178_cast_fp16")];
+            int32 var_11181_axis_0 = const()[name = string("op_11181_axis_0"), val = int32(0)];
+            tensor<fp16, [10, 2, 512, 512]> V_sliding_out = stack(axis = var_11181_axis_0, values = (var_1724_cast_fp16, var_2283_cast_fp16, var_2842_cast_fp16, var_3401_cast_fp16, var_3960_cast_fp16, var_5036_cast_fp16, var_5595_cast_fp16, var_6154_cast_fp16, var_6713_cast_fp16, var_7282_cast_fp16))[name = string("op_11181_cast_fp16")];
+            int32 var_11184_axis_0 = const()[name = string("op_11184_axis_0"), val = int32(0)];
+            tensor<fp16, [2, 2, 2048, 512]> K_full_out = stack(axis = var_11184_axis_0, values = (var_4475_cast_fp16, var_7807_cast_fp16))[name = string("op_11184_cast_fp16")];
+            int32 var_11187_axis_0 = const()[name = string("op_11187_axis_0"), val = int32(0)];
+            tensor<fp16, [2, 2, 2048, 512]> V_full_out = stack(axis = var_11187_axis_0, values = (var_4477_cast_fp16, var_7809_cast_fp16))[name = string("op_11187_cast_fp16")];
+        } -> (hidden_states_out, K_sliding_out, V_sliding_out, K_full_out, V_full_out, kv13_k, kv13_v, kv14_k, kv14_v);
+}
\ No newline at end of file
diff --git a/chunk2_3way.mlmodelc/weights/weight.bin b/chunk2_3way.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cdd76a14663cd78fc1a5e310f0480a852e059bb8
--- /dev/null
+++ b/chunk2_3way.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dfcddee4de0905bab42c9dbf6b4d03ec1fc888d76de762dee5153423081b838
+size 984936000
diff --git a/chunk3.mlmodelc/analytics/coremldata.bin b/chunk3.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..743cc3d816cd971d742e94b590870f4b807ccf01
--- /dev/null
+++ b/chunk3.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83e107e7d0f531fa20c6861ea0483120a4246adc98daba2bdc9ec015f77bc7ac
+size 243
diff --git a/chunk3.mlmodelc/coremldata.bin b/chunk3.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f0d51080450c48a965e71afe61876ffe1ced1cff
--- /dev/null
+++ b/chunk3.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d43e7d6694b27bec11ece3eb5bb8b3b5b185fa8bc0b668980e154952fd36bf0b
+size 940
diff --git a/chunk3.mlmodelc/model.mil b/chunk3.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..3a509e87f3c5d31c8231e1c5ec49885b4e8744cf
--- /dev/null
+++ b/chunk3.mlmodelc/model.mil
@@ -0,0 +1,3871 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func decode_q1<ios18>(tensor<fp16, [1, 1, 1, 2048]> causal_mask_full, tensor<fp16, [1, 1, 1, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 1, 512]> cos_f, tensor<fp16, [1, 1, 1, 256]> cos_s, tensor<fp16, [1, 1, 2560]> hidden_states, tensor<fp16, [1, 2, 512, 256]> kv13_k, tensor<fp16, [1, 2, 512, 256]> kv13_v, tensor<fp16, [1, 2, 2048, 512]> kv14_k, tensor<fp16, [1, 2, 2048, 512]> kv14_v, tensor<fp16, [1, 1, 10752]> per_layer_combined, tensor<fp16, [1, 1, 1, 512]> sin_f, tensor<fp16, [1, 1, 1, 256]> sin_s, tensor<fp16, [1, 1, 2048, 1]> update_mask) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15731520))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15741824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28849088))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28859392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41966656))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41969280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41974464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302208))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44924032))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44926144))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58033408))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58043712))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71150976))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71161280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84268544))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84271168)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84276352))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604096))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604416))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87225920))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87228032))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100335296))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100345600))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113452864))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113463168))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126570432))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126573056)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126578240))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126905984))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126906304))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129527808))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129529920))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(142637184))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(142647488))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(155754752))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(155765056))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168872320))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168874944)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168880128))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(169207872))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(169208192))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171829696))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171831808))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(184939072))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(184949376))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(198056640))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(198066944))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211174208))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211176832)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211182016))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211509760))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211510080))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216753024))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_q_norm_weight = const()[name = string("layers_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216757184)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216758272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229865536))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229875840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242983104))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242993408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256100672))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256103296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256108480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436224))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259058048))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259060160))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272167424))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272177728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285284992))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285295296))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298402560))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298405184)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298410368))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738112))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738432))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301359936))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301362048))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314469312))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314479616))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327586880))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327597184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340704448))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340707072)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340712256))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040000))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040320))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(343661824))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(343663936))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356771200))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356781504))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369888768))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369899072))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383006336))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383008960)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383014144))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383341888))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            int32 var_450 = const()[name = string("op_450"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_452_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_452_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_1_cast_fp16 = concat(axis = var_450, interleave = input_1_interleave_0, values = (hidden_states, var_452_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_447_to_fp16 = const()[name = string("op_447_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_447_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_457_split_sizes_0 = const()[name = string("op_457_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_457_axis_0 = const()[name = string("op_457_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_457_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_457_cast_fp16_1 = split(axis = var_457_axis_0, split_sizes = var_457_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_457_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383342208)))];
+            tensor<fp16, [1, 1, 2560]> h_1_cast_fp16 = mul(x = var_457_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_463 = const()[name = string("op_463"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_466_axes_0 = const()[name = string("op_466_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_464_cast_fp16 = transpose(perm = var_463, x = h_1_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [1, 2560, 1, 1]> var_466_cast_fp16 = expand_dims(axes = var_466_axes_0, x = var_464_cast_fp16)[name = string("op_466_cast_fp16")];
+            string var_482_pad_type_0 = const()[name = string("op_482_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_482_strides_0 = const()[name = string("op_482_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_482_pad_0 = const()[name = string("op_482_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_482_dilations_0 = const()[name = string("op_482_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_482_groups_0 = const()[name = string("op_482_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_482 = conv(dilations = var_482_dilations_0, groups = var_482_groups_0, pad = var_482_pad_0, pad_type = var_482_pad_type_0, strides = var_482_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_466_cast_fp16)[name = string("op_482")];
+            tensor<int32, [4]> var_487 = const()[name = string("op_487"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_488 = reshape(shape = var_487, x = var_482)[name = string("op_488")];
+            tensor<int32, [4]> var_493 = const()[name = string("op_493"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_503 = const()[name = string("op_503"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_494 = transpose(perm = var_493, x = var_488)[name = string("transpose_100")];
+            tensor<fp16, [1, 8, 256]> x_1 = reshape(shape = var_503, x = var_494)[name = string("x_1")];
+            int32 var_509 = const()[name = string("op_509"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_511 = mul(x = x_1, y = const_1_promoted)[name = string("op_511")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_5 = concat(axis = var_509, interleave = input_5_interleave_0, values = (x_1, var_511))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_506_to_fp16 = const()[name = string("op_506_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_506_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_516_split_sizes_0 = const()[name = string("op_516_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_516_axis_0 = const()[name = string("op_516_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_516_0, tensor<fp16, [1, 8, 256]> var_516_1 = split(axis = var_516_axis_0, split_sizes = var_516_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_516")];
+            tensor<fp16, [1, 8, 256]> var_518 = mul(x = var_516_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_518")];
+            tensor<int32, [4]> var_523 = const()[name = string("op_523"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_3 = reshape(shape = var_523, x = var_518)[name = string("q_3")];
+            tensor<fp16, [1, 8, 1, 256]> var_525_cast_fp16 = mul(x = q_3, y = cos_s)[name = string("op_525_cast_fp16")];
+            tensor<int32, [2]> var_526_split_sizes_0 = const()[name = string("op_526_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_526_axis_0 = const()[name = string("op_526_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_526_0, tensor<fp16, [1, 8, 1, 128]> var_526_1 = split(axis = var_526_axis_0, split_sizes = var_526_split_sizes_0, x = q_3)[name = string("op_526")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_528 = mul(x = var_526_1, y = const_2_promoted)[name = string("op_528")];
+            int32 var_530 = const()[name = string("op_530"), val = int32(-1)];
+            bool var_531_interleave_0 = const()[name = string("op_531_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_531 = concat(axis = var_530, interleave = var_531_interleave_0, values = (var_528, var_526_0))[name = string("op_531")];
+            tensor<fp16, [1, 8, 1, 256]> var_532_cast_fp16 = mul(x = var_531, y = sin_s)[name = string("op_532_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_5_cast_fp16 = add(x = var_525_cast_fp16, y = var_532_cast_fp16)[name = string("q_5_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = kv13_k)[name = string("transpose_99")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = kv13_v)[name = string("transpose_97")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_5_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_3_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_3_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 1, 512]> var_564 = sub(x = x_3_cast_fp16, y = reduce_max_0)[name = string("op_564")];
+            tensor<fp16, [1, 8, 1, 512]> var_570 = exp(x = var_564)[name = string("op_570")];
+            tensor<int32, [1]> var_580_axes_0 = const()[name = string("op_580_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_580_keep_dims_0 = const()[name = string("op_580_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_580 = reduce_sum(axes = var_580_axes_0, keep_dims = var_580_keep_dims_0, x = var_570)[name = string("op_580")];
+            tensor<fp16, [1, 8, 1, 512]> var_586_cast_fp16 = real_div(x = var_570, y = var_580)[name = string("op_586_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_586_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_597 = const()[name = string("op_597"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_604 = const()[name = string("op_604"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_598_cast_fp16 = transpose(perm = var_597, x = attn_output_1_cast_fp16)[name = string("transpose_93")];
+            tensor<fp16, [1, 1, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_604, x = var_598_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_609 = const()[name = string("op_609"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_625_pad_type_0 = const()[name = string("op_625_pad_type_0"), val = string("valid")];
+            int32 var_625_groups_0 = const()[name = string("op_625_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_625_strides_0 = const()[name = string("op_625_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_625_pad_0 = const()[name = string("op_625_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_625_dilations_0 = const()[name = string("op_625_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383347392))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385968896))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_610_cast_fp16 = transpose(perm = var_609, x = attn_output_3_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 2560, 1]> var_625_cast_fp16 = conv(dilations = var_625_dilations_0, groups = var_625_groups_0, pad = var_625_pad_0, pad_type = var_625_pad_type_0, strides = var_625_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_610_cast_fp16)[name = string("op_625_cast_fp16")];
+            tensor<int32, [3]> var_629 = const()[name = string("op_629"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_635 = const()[name = string("op_635"), val = int32(-1)];
+            fp16 const_3_promoted_to_fp16 = const()[name = string("const_3_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_7_cast_fp16 = transpose(perm = var_629, x = var_625_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [1, 1, 2560]> var_637_cast_fp16 = mul(x = x_7_cast_fp16, y = const_3_promoted_to_fp16)[name = string("op_637_cast_fp16")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_9_cast_fp16 = concat(axis = var_635, interleave = input_9_interleave_0, values = (x_7_cast_fp16, var_637_cast_fp16))[name = string("input_9_cast_fp16")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_632_to_fp16 = const()[name = string("op_632_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_632_to_fp16, x = input_9_cast_fp16)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_642_split_sizes_0 = const()[name = string("op_642_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_642_axis_0 = const()[name = string("op_642_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_642_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_642_cast_fp16_1 = split(axis = var_642_axis_0, split_sizes = var_642_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_642_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385971520)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_5_cast_fp16 = mul(x = var_642_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_9_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            int32 var_651 = const()[name = string("op_651"), val = int32(-1)];
+            fp16 const_4_promoted_to_fp16 = const()[name = string("const_4_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_653_cast_fp16 = mul(x = x_9_cast_fp16, y = const_4_promoted_to_fp16)[name = string("op_653_cast_fp16")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_11_cast_fp16 = concat(axis = var_651, interleave = input_11_interleave_0, values = (x_9_cast_fp16, var_653_cast_fp16))[name = string("input_11_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_648_to_fp16 = const()[name = string("op_648_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_648_to_fp16, x = input_11_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_658_split_sizes_0 = const()[name = string("op_658_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_658_axis_0 = const()[name = string("op_658_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_658_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_658_cast_fp16_1 = split(axis = var_658_axis_0, split_sizes = var_658_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_658_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385976704)))];
+            tensor<fp16, [1, 1, 2560]> h_3_cast_fp16 = mul(x = var_658_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_669 = const()[name = string("op_669"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_13_axes_0 = const()[name = string("input_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_670 = transpose(perm = var_669, x = h_3_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 2560, 1, 1]> input_13 = expand_dims(axes = input_13_axes_0, x = var_670)[name = string("input_13")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_13)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_13)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 1]> input_15 = mul(x = gate_3, y = up_1)[name = string("input_15")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_15)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_710_axes_0 = const()[name = string("op_710_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_710 = squeeze(axes = var_710_axes_0, x = mlp_out_1)[name = string("op_710")];
+            tensor<int32, [3]> var_714 = const()[name = string("op_714"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_720 = const()[name = string("op_720"), val = int32(-1)];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_11 = transpose(perm = var_714, x = var_710)[name = string("transpose_89")];
+            tensor<fp16, [1, 1, 2560]> var_722 = mul(x = x_11, y = const_5_promoted)[name = string("op_722")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_17 = concat(axis = var_720, interleave = input_17_interleave_0, values = (x_11, var_722))[name = string("input_17")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_717_to_fp16 = const()[name = string("op_717_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_717_to_fp16, x = input_17)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_727_split_sizes_0 = const()[name = string("op_727_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_727_axis_0 = const()[name = string("op_727_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_727_0, tensor<fp16, [1, 1, 2560]> var_727_1 = split(axis = var_727_axis_0, split_sizes = var_727_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_727")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_3 = mul(x = var_727_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_5_cast_fp16 = add(x = x_9_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 6144])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 1, 6400])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_755 = const()[name = string("op_755"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_756 = transpose(perm = var_755, x = hidden_states_5_cast_fp16)[name = string("transpose_88")];
+            tensor<fp16, [1, 2560, 1, 1]> input_19 = expand_dims(axes = input_19_axes_0, x = var_756)[name = string("input_19")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_19)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_775 = const()[name = string("op_775"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_776_cast_fp16 = transpose(perm = var_775, x = per_layer_slice_1_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_776_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_21_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_21_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385981888))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386309632))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_21_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_792_axes_0 = const()[name = string("op_792_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_792_cast_fp16 = squeeze(axes = var_792_axes_0, x = gated_5_cast_fp16)[name = string("op_792_cast_fp16")];
+            tensor<int32, [3]> var_796 = const()[name = string("op_796"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_802 = const()[name = string("op_802"), val = int32(-1)];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_13_cast_fp16 = transpose(perm = var_796, x = var_792_cast_fp16)[name = string("transpose_86")];
+            tensor<fp16, [1, 1, 2560]> var_804_cast_fp16 = mul(x = x_13_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_804_cast_fp16")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_23_cast_fp16 = concat(axis = var_802, interleave = input_23_interleave_0, values = (x_13_cast_fp16, var_804_cast_fp16))[name = string("input_23_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_799_to_fp16 = const()[name = string("op_799_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_799_to_fp16, x = input_23_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_809_split_sizes_0 = const()[name = string("op_809_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_809_axis_0 = const()[name = string("op_809_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_809_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_809_cast_fp16_1 = split(axis = var_809_axis_0, split_sizes = var_809_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_809_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386312256)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_9_cast_fp16 = mul(x = var_809_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.02p-1])];
+            tensor<fp16, [1, 1, 2560]> x_15_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_824 = const()[name = string("op_824"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_826_cast_fp16 = mul(x = x_15_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_826_cast_fp16")];
+            bool input_25_interleave_0 = const()[name = string("input_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_25_cast_fp16 = concat(axis = var_824, interleave = input_25_interleave_0, values = (x_15_cast_fp16, var_826_cast_fp16))[name = string("input_25_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_821_to_fp16 = const()[name = string("op_821_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_821_to_fp16, x = input_25_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_831_split_sizes_0 = const()[name = string("op_831_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_831_axis_0 = const()[name = string("op_831_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_831_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_831_cast_fp16_1 = split(axis = var_831_axis_0, split_sizes = var_831_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_831_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386317440)))];
+            tensor<fp16, [1, 1, 2560]> h_7_cast_fp16 = mul(x = var_831_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_837 = const()[name = string("op_837"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_840_axes_0 = const()[name = string("op_840_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_838_cast_fp16 = transpose(perm = var_837, x = h_7_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 2560, 1, 1]> var_840_cast_fp16 = expand_dims(axes = var_840_axes_0, x = var_838_cast_fp16)[name = string("op_840_cast_fp16")];
+            string var_856_pad_type_0 = const()[name = string("op_856_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_856_strides_0 = const()[name = string("op_856_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_856_pad_0 = const()[name = string("op_856_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_856_dilations_0 = const()[name = string("op_856_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_856_groups_0 = const()[name = string("op_856_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_856 = conv(dilations = var_856_dilations_0, groups = var_856_groups_0, pad = var_856_pad_0, pad_type = var_856_pad_type_0, strides = var_856_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_840_cast_fp16)[name = string("op_856")];
+            tensor<int32, [4]> var_861 = const()[name = string("op_861"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_862 = reshape(shape = var_861, x = var_856)[name = string("op_862")];
+            tensor<int32, [4]> var_867 = const()[name = string("op_867"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_877 = const()[name = string("op_877"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_868 = transpose(perm = var_867, x = var_862)[name = string("transpose_84")];
+            tensor<fp16, [1, 8, 256]> x_17 = reshape(shape = var_877, x = var_868)[name = string("x_17")];
+            int32 var_883 = const()[name = string("op_883"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_885 = mul(x = x_17, y = const_9_promoted)[name = string("op_885")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_29 = concat(axis = var_883, interleave = input_29_interleave_0, values = (x_17, var_885))[name = string("input_29")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_880_to_fp16 = const()[name = string("op_880_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_880_to_fp16, x = input_29)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_890_split_sizes_0 = const()[name = string("op_890_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_890_axis_0 = const()[name = string("op_890_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_890_0, tensor<fp16, [1, 8, 256]> var_890_1 = split(axis = var_890_axis_0, split_sizes = var_890_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_890")];
+            tensor<fp16, [1, 8, 256]> var_892 = mul(x = var_890_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_892")];
+            tensor<int32, [4]> var_897 = const()[name = string("op_897"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_9 = reshape(shape = var_897, x = var_892)[name = string("q_9")];
+            tensor<fp16, [1, 8, 1, 256]> var_899_cast_fp16 = mul(x = q_9, y = cos_s)[name = string("op_899_cast_fp16")];
+            tensor<int32, [2]> var_900_split_sizes_0 = const()[name = string("op_900_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_900_axis_0 = const()[name = string("op_900_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_900_0, tensor<fp16, [1, 8, 1, 128]> var_900_1 = split(axis = var_900_axis_0, split_sizes = var_900_split_sizes_0, x = q_9)[name = string("op_900")];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_902 = mul(x = var_900_1, y = const_10_promoted)[name = string("op_902")];
+            int32 var_904 = const()[name = string("op_904"), val = int32(-1)];
+            bool var_905_interleave_0 = const()[name = string("op_905_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_905 = concat(axis = var_904, interleave = var_905_interleave_0, values = (var_902, var_900_0))[name = string("op_905")];
+            tensor<fp16, [1, 8, 1, 256]> var_906_cast_fp16 = mul(x = var_905, y = sin_s)[name = string("op_906_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_11_cast_fp16 = add(x = var_899_cast_fp16, y = var_906_cast_fp16)[name = string("q_11_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_11_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_19_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_19_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 1, 512]> var_938 = sub(x = x_19_cast_fp16, y = reduce_max_1)[name = string("op_938")];
+            tensor<fp16, [1, 8, 1, 512]> var_944 = exp(x = var_938)[name = string("op_944")];
+            tensor<int32, [1]> var_954_axes_0 = const()[name = string("op_954_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_954_keep_dims_0 = const()[name = string("op_954_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_954 = reduce_sum(axes = var_954_axes_0, keep_dims = var_954_keep_dims_0, x = var_944)[name = string("op_954")];
+            tensor<fp16, [1, 8, 1, 512]> var_960_cast_fp16 = real_div(x = var_944, y = var_954)[name = string("op_960_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_960_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_971 = const()[name = string("op_971"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_978 = const()[name = string("op_978"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_972_cast_fp16 = transpose(perm = var_971, x = attn_output_7_cast_fp16)[name = string("transpose_83")];
+            tensor<fp16, [1, 1, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_978, x = var_972_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_983 = const()[name = string("op_983"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_999_pad_type_0 = const()[name = string("op_999_pad_type_0"), val = string("valid")];
+            int32 var_999_groups_0 = const()[name = string("op_999_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_999_strides_0 = const()[name = string("op_999_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_999_pad_0 = const()[name = string("op_999_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_999_dilations_0 = const()[name = string("op_999_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386322624))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388944128))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_984_cast_fp16 = transpose(perm = var_983, x = attn_output_9_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1]> var_999_cast_fp16 = conv(dilations = var_999_dilations_0, groups = var_999_groups_0, pad = var_999_pad_0, pad_type = var_999_pad_type_0, strides = var_999_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_984_cast_fp16)[name = string("op_999_cast_fp16")];
+            tensor<int32, [3]> var_1003 = const()[name = string("op_1003"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1009 = const()[name = string("op_1009"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_23_cast_fp16 = transpose(perm = var_1003, x = var_999_cast_fp16)[name = string("transpose_81")];
+            tensor<fp16, [1, 1, 2560]> var_1011_cast_fp16 = mul(x = x_23_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1011_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_33_cast_fp16 = concat(axis = var_1009, interleave = input_33_interleave_0, values = (x_23_cast_fp16, var_1011_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1006_to_fp16 = const()[name = string("op_1006_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1006_to_fp16, x = input_33_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1016_split_sizes_0 = const()[name = string("op_1016_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1016_axis_0 = const()[name = string("op_1016_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1016_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1016_cast_fp16_1 = split(axis = var_1016_axis_0, split_sizes = var_1016_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1016_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388946752)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_11_cast_fp16 = mul(x = var_1016_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_25_cast_fp16 = add(x = x_15_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_25_cast_fp16")];
+            int32 var_1025 = const()[name = string("op_1025"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1027_cast_fp16 = mul(x = x_25_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1027_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_35_cast_fp16 = concat(axis = var_1025, interleave = input_35_interleave_0, values = (x_25_cast_fp16, var_1027_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1022_to_fp16 = const()[name = string("op_1022_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1022_to_fp16, x = input_35_cast_fp16)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1032_split_sizes_0 = const()[name = string("op_1032_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1032_axis_0 = const()[name = string("op_1032_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1032_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1032_cast_fp16_1 = split(axis = var_1032_axis_0, split_sizes = var_1032_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1032_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388951936)))];
+            tensor<fp16, [1, 1, 2560]> h_9_cast_fp16 = mul(x = var_1032_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1043 = const()[name = string("op_1043"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_37_axes_0 = const()[name = string("input_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1044 = transpose(perm = var_1043, x = h_9_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 2560, 1, 1]> input_37 = expand_dims(axes = input_37_axes_0, x = var_1044)[name = string("input_37")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_37)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_37)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 1]> input_39 = mul(x = gate_7, y = up_3)[name = string("input_39")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_39)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1084_axes_0 = const()[name = string("op_1084_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1084 = squeeze(axes = var_1084_axes_0, x = mlp_out_3)[name = string("op_1084")];
+            tensor<int32, [3]> var_1088 = const()[name = string("op_1088"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1094 = const()[name = string("op_1094"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_27 = transpose(perm = var_1088, x = var_1084)[name = string("transpose_79")];
+            tensor<fp16, [1, 1, 2560]> var_1096 = mul(x = x_27, y = const_13_promoted)[name = string("op_1096")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_41 = concat(axis = var_1094, interleave = input_41_interleave_0, values = (x_27, var_1096))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1091_to_fp16 = const()[name = string("op_1091_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1091_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1101_split_sizes_0 = const()[name = string("op_1101_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1101_axis_0 = const()[name = string("op_1101_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1101_0, tensor<fp16, [1, 1, 2560]> var_1101_1 = split(axis = var_1101_axis_0, split_sizes = var_1101_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1101")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_13 = mul(x = var_1101_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_15_cast_fp16 = add(x = x_25_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 6400])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 1, 6656])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1129 = const()[name = string("op_1129"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_43_axes_0 = const()[name = string("input_43_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1130 = transpose(perm = var_1129, x = hidden_states_15_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [1, 2560, 1, 1]> input_43 = expand_dims(axes = input_43_axes_0, x = var_1130)[name = string("input_43")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_43)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1149 = const()[name = string("op_1149"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1150_cast_fp16 = transpose(perm = var_1149, x = per_layer_slice_3_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1150_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_45_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388957120))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389284864))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_45_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1166_axes_0 = const()[name = string("op_1166_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1166_cast_fp16 = squeeze(axes = var_1166_axes_0, x = gated_11_cast_fp16)[name = string("op_1166_cast_fp16")];
+            tensor<int32, [3]> var_1170 = const()[name = string("op_1170"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1176 = const()[name = string("op_1176"), val = int32(-1)];
+            fp16 const_14_promoted_to_fp16 = const()[name = string("const_14_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_29_cast_fp16 = transpose(perm = var_1170, x = var_1166_cast_fp16)[name = string("transpose_76")];
+            tensor<fp16, [1, 1, 2560]> var_1178_cast_fp16 = mul(x = x_29_cast_fp16, y = const_14_promoted_to_fp16)[name = string("op_1178_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_47_cast_fp16 = concat(axis = var_1176, interleave = input_47_interleave_0, values = (x_29_cast_fp16, var_1178_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1173_to_fp16 = const()[name = string("op_1173_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1173_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1183_split_sizes_0 = const()[name = string("op_1183_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1183_axis_0 = const()[name = string("op_1183_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1183_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1183_cast_fp16_1 = split(axis = var_1183_axis_0, split_sizes = var_1183_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1183_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389287488)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1183_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_15_promoted_to_fp16 = const()[name = string("const_15_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_31_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_15_promoted_to_fp16)[name = string("x_31_cast_fp16")];
+            int32 var_1198 = const()[name = string("op_1198"), val = int32(-1)];
+            fp16 const_16_promoted_to_fp16 = const()[name = string("const_16_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1200_cast_fp16 = mul(x = x_31_cast_fp16, y = const_16_promoted_to_fp16)[name = string("op_1200_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_49_cast_fp16 = concat(axis = var_1198, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1200_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1195_to_fp16 = const()[name = string("op_1195_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1195_to_fp16, x = input_49_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1205_split_sizes_0 = const()[name = string("op_1205_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1205_axis_0 = const()[name = string("op_1205_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1205_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1205_cast_fp16_1 = split(axis = var_1205_axis_0, split_sizes = var_1205_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1205_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389292672)))];
+            tensor<fp16, [1, 1, 2560]> h_13_cast_fp16 = mul(x = var_1205_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1211 = const()[name = string("op_1211"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1214_axes_0 = const()[name = string("op_1214_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1212_cast_fp16 = transpose(perm = var_1211, x = h_13_cast_fp16)[name = string("transpose_75")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1214_cast_fp16 = expand_dims(axes = var_1214_axes_0, x = var_1212_cast_fp16)[name = string("op_1214_cast_fp16")];
+            string var_1230_pad_type_0 = const()[name = string("op_1230_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1230_strides_0 = const()[name = string("op_1230_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1230_pad_0 = const()[name = string("op_1230_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1230_dilations_0 = const()[name = string("op_1230_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1230_groups_0 = const()[name = string("op_1230_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1230 = conv(dilations = var_1230_dilations_0, groups = var_1230_groups_0, pad = var_1230_pad_0, pad_type = var_1230_pad_type_0, strides = var_1230_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1214_cast_fp16)[name = string("op_1230")];
+            tensor<int32, [4]> var_1235 = const()[name = string("op_1235"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1236 = reshape(shape = var_1235, x = var_1230)[name = string("op_1236")];
+            tensor<int32, [4]> var_1241 = const()[name = string("op_1241"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1251 = const()[name = string("op_1251"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1242 = transpose(perm = var_1241, x = var_1236)[name = string("transpose_74")];
+            tensor<fp16, [1, 8, 256]> x_33 = reshape(shape = var_1251, x = var_1242)[name = string("x_33")];
+            int32 var_1257 = const()[name = string("op_1257"), val = int32(-1)];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1259 = mul(x = x_33, y = const_17_promoted)[name = string("op_1259")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_53 = concat(axis = var_1257, interleave = input_53_interleave_0, values = (x_33, var_1259))[name = string("input_53")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1254_to_fp16 = const()[name = string("op_1254_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1254_to_fp16, x = input_53)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1264_split_sizes_0 = const()[name = string("op_1264_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1264_axis_0 = const()[name = string("op_1264_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1264_0, tensor<fp16, [1, 8, 256]> var_1264_1 = split(axis = var_1264_axis_0, split_sizes = var_1264_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1264")];
+            tensor<fp16, [1, 8, 256]> var_1266 = mul(x = var_1264_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_1266")];
+            tensor<int32, [4]> var_1271 = const()[name = string("op_1271"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_15 = reshape(shape = var_1271, x = var_1266)[name = string("q_15")];
+            tensor<fp16, [1, 8, 1, 256]> var_1273_cast_fp16 = mul(x = q_15, y = cos_s)[name = string("op_1273_cast_fp16")];
+            tensor<int32, [2]> var_1274_split_sizes_0 = const()[name = string("op_1274_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1274_axis_0 = const()[name = string("op_1274_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1274_0, tensor<fp16, [1, 8, 1, 128]> var_1274_1 = split(axis = var_1274_axis_0, split_sizes = var_1274_split_sizes_0, x = q_15)[name = string("op_1274")];
+            fp16 const_18_promoted = const()[name = string("const_18_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1276 = mul(x = var_1274_1, y = const_18_promoted)[name = string("op_1276")];
+            int32 var_1278 = const()[name = string("op_1278"), val = int32(-1)];
+            bool var_1279_interleave_0 = const()[name = string("op_1279_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1279 = concat(axis = var_1278, interleave = var_1279_interleave_0, values = (var_1276, var_1274_0))[name = string("op_1279")];
+            tensor<fp16, [1, 8, 1, 256]> var_1280_cast_fp16 = mul(x = var_1279, y = sin_s)[name = string("op_1280_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_17_cast_fp16 = add(x = var_1273_cast_fp16, y = var_1280_cast_fp16)[name = string("q_17_cast_fp16")];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_17_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_35_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_35_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_35_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 1, 512]> var_1312 = sub(x = x_35_cast_fp16, y = reduce_max_2)[name = string("op_1312")];
+            tensor<fp16, [1, 8, 1, 512]> var_1318 = exp(x = var_1312)[name = string("op_1318")];
+            tensor<int32, [1]> var_1328_axes_0 = const()[name = string("op_1328_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1328_keep_dims_0 = const()[name = string("op_1328_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1328 = reduce_sum(axes = var_1328_axes_0, keep_dims = var_1328_keep_dims_0, x = var_1318)[name = string("op_1328")];
+            tensor<fp16, [1, 8, 1, 512]> var_1334_cast_fp16 = real_div(x = var_1318, y = var_1328)[name = string("op_1334_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_1334_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1345 = const()[name = string("op_1345"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1352 = const()[name = string("op_1352"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1346_cast_fp16 = transpose(perm = var_1345, x = attn_output_13_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [1, 1, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_1352, x = var_1346_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_1357 = const()[name = string("op_1357"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1373_pad_type_0 = const()[name = string("op_1373_pad_type_0"), val = string("valid")];
+            int32 var_1373_groups_0 = const()[name = string("op_1373_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1373_strides_0 = const()[name = string("op_1373_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1373_pad_0 = const()[name = string("op_1373_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1373_dilations_0 = const()[name = string("op_1373_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389297856))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391919360))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1358_cast_fp16 = transpose(perm = var_1357, x = attn_output_15_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 2560, 1]> var_1373_cast_fp16 = conv(dilations = var_1373_dilations_0, groups = var_1373_groups_0, pad = var_1373_pad_0, pad_type = var_1373_pad_type_0, strides = var_1373_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_1358_cast_fp16)[name = string("op_1373_cast_fp16")];
+            tensor<int32, [3]> var_1377 = const()[name = string("op_1377"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1383 = const()[name = string("op_1383"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_39_cast_fp16 = transpose(perm = var_1377, x = var_1373_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 1, 2560]> var_1385_cast_fp16 = mul(x = x_39_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1385_cast_fp16")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_57_cast_fp16 = concat(axis = var_1383, interleave = input_57_interleave_0, values = (x_39_cast_fp16, var_1385_cast_fp16))[name = string("input_57_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1380_to_fp16 = const()[name = string("op_1380_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1380_to_fp16, x = input_57_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1390_split_sizes_0 = const()[name = string("op_1390_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1390_axis_0 = const()[name = string("op_1390_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1390_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1390_cast_fp16_1 = split(axis = var_1390_axis_0, split_sizes = var_1390_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1390_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391921984)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_17_cast_fp16 = mul(x = var_1390_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_41_cast_fp16 = add(x = x_31_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_41_cast_fp16")];
+            int32 var_1399 = const()[name = string("op_1399"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1401_cast_fp16 = mul(x = x_41_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1401_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_59_cast_fp16 = concat(axis = var_1399, interleave = input_59_interleave_0, values = (x_41_cast_fp16, var_1401_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1396_to_fp16 = const()[name = string("op_1396_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1396_to_fp16, x = input_59_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1406_split_sizes_0 = const()[name = string("op_1406_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1406_axis_0 = const()[name = string("op_1406_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1406_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1406_cast_fp16_1 = split(axis = var_1406_axis_0, split_sizes = var_1406_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1406_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391927168)))];
+            tensor<fp16, [1, 1, 2560]> h_15_cast_fp16 = mul(x = var_1406_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_1417 = const()[name = string("op_1417"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_61_axes_0 = const()[name = string("input_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1418 = transpose(perm = var_1417, x = h_15_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [1, 2560, 1, 1]> input_61 = expand_dims(axes = input_61_axes_0, x = var_1418)[name = string("input_61")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_61)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_61)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 1]> input_63 = mul(x = gate_11, y = up_5)[name = string("input_63")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_63)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_1458_axes_0 = const()[name = string("op_1458_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1458 = squeeze(axes = var_1458_axes_0, x = mlp_out_5)[name = string("op_1458")];
+            tensor<int32, [3]> var_1462 = const()[name = string("op_1462"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1468 = const()[name = string("op_1468"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_43 = transpose(perm = var_1462, x = var_1458)[name = string("transpose_69")];
+            tensor<fp16, [1, 1, 2560]> var_1470 = mul(x = x_43, y = const_21_promoted)[name = string("op_1470")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_65 = concat(axis = var_1468, interleave = input_65_interleave_0, values = (x_43, var_1470))[name = string("input_65")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1465_to_fp16 = const()[name = string("op_1465_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_1465_to_fp16, x = input_65)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_1475_split_sizes_0 = const()[name = string("op_1475_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1475_axis_0 = const()[name = string("op_1475_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1475_0, tensor<fp16, [1, 1, 2560]> var_1475_1 = split(axis = var_1475_axis_0, split_sizes = var_1475_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_1475")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_23 = mul(x = var_1475_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_25_cast_fp16 = add(x = x_41_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 6656])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 1, 6912])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_1503 = const()[name = string("op_1503"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_67_axes_0 = const()[name = string("input_67_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1504 = transpose(perm = var_1503, x = hidden_states_25_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 2560, 1, 1]> input_67 = expand_dims(axes = input_67_axes_0, x = var_1504)[name = string("input_67")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_67)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_1523 = const()[name = string("op_1523"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1524_cast_fp16 = transpose(perm = var_1523, x = per_layer_slice_5_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_1524_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_69_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_69_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391932352))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392260096))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_69_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_1540_axes_0 = const()[name = string("op_1540_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1540_cast_fp16 = squeeze(axes = var_1540_axes_0, x = gated_17_cast_fp16)[name = string("op_1540_cast_fp16")];
+            tensor<int32, [3]> var_1544 = const()[name = string("op_1544"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1550 = const()[name = string("op_1550"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_45_cast_fp16 = transpose(perm = var_1544, x = var_1540_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 1, 2560]> var_1552_cast_fp16 = mul(x = x_45_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1552_cast_fp16")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_71_cast_fp16 = concat(axis = var_1550, interleave = input_71_interleave_0, values = (x_45_cast_fp16, var_1552_cast_fp16))[name = string("input_71_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1547_to_fp16 = const()[name = string("op_1547_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_1547_to_fp16, x = input_71_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_1557_split_sizes_0 = const()[name = string("op_1557_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1557_axis_0 = const()[name = string("op_1557_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1557_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1557_cast_fp16_1 = split(axis = var_1557_axis_0, split_sizes = var_1557_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_1557_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392262720)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_29_cast_fp16 = mul(x = var_1557_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_47_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_47_cast_fp16")];
+            int32 var_1572 = const()[name = string("op_1572"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1574_cast_fp16 = mul(x = x_47_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1574_cast_fp16")];
+            bool input_73_interleave_0 = const()[name = string("input_73_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_73_cast_fp16 = concat(axis = var_1572, interleave = input_73_interleave_0, values = (x_47_cast_fp16, var_1574_cast_fp16))[name = string("input_73_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1569_to_fp16 = const()[name = string("op_1569_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_1569_to_fp16, x = input_73_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_1579_split_sizes_0 = const()[name = string("op_1579_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1579_axis_0 = const()[name = string("op_1579_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1579_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1579_cast_fp16_1 = split(axis = var_1579_axis_0, split_sizes = var_1579_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_1579_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392267904)))];
+            tensor<fp16, [1, 1, 2560]> h_19_cast_fp16 = mul(x = var_1579_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_1585 = const()[name = string("op_1585"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1588_axes_0 = const()[name = string("op_1588_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1586_cast_fp16 = transpose(perm = var_1585, x = h_19_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1588_cast_fp16 = expand_dims(axes = var_1588_axes_0, x = var_1586_cast_fp16)[name = string("op_1588_cast_fp16")];
+            string var_1604_pad_type_0 = const()[name = string("op_1604_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1604_strides_0 = const()[name = string("op_1604_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1604_pad_0 = const()[name = string("op_1604_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1604_dilations_0 = const()[name = string("op_1604_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1604_groups_0 = const()[name = string("op_1604_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1604 = conv(dilations = var_1604_dilations_0, groups = var_1604_groups_0, pad = var_1604_pad_0, pad_type = var_1604_pad_type_0, strides = var_1604_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_1588_cast_fp16)[name = string("op_1604")];
+            tensor<int32, [4]> var_1609 = const()[name = string("op_1609"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1610 = reshape(shape = var_1609, x = var_1604)[name = string("op_1610")];
+            tensor<int32, [4]> var_1615 = const()[name = string("op_1615"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1625 = const()[name = string("op_1625"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1616 = transpose(perm = var_1615, x = var_1610)[name = string("transpose_64")];
+            tensor<fp16, [1, 8, 256]> x_49 = reshape(shape = var_1625, x = var_1616)[name = string("x_49")];
+            int32 var_1631 = const()[name = string("op_1631"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1633 = mul(x = x_49, y = const_25_promoted)[name = string("op_1633")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_77 = concat(axis = var_1631, interleave = input_77_interleave_0, values = (x_49, var_1633))[name = string("input_77")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1628_to_fp16 = const()[name = string("op_1628_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_1628_to_fp16, x = input_77)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_1638_split_sizes_0 = const()[name = string("op_1638_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1638_axis_0 = const()[name = string("op_1638_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1638_0, tensor<fp16, [1, 8, 256]> var_1638_1 = split(axis = var_1638_axis_0, split_sizes = var_1638_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_1638")];
+            tensor<fp16, [1, 8, 256]> var_1640 = mul(x = var_1638_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_1640")];
+            tensor<int32, [4]> var_1645 = const()[name = string("op_1645"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_21 = reshape(shape = var_1645, x = var_1640)[name = string("q_21")];
+            tensor<fp16, [1, 8, 1, 256]> var_1647_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_1647_cast_fp16")];
+            tensor<int32, [2]> var_1648_split_sizes_0 = const()[name = string("op_1648_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1648_axis_0 = const()[name = string("op_1648_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1648_0, tensor<fp16, [1, 8, 1, 128]> var_1648_1 = split(axis = var_1648_axis_0, split_sizes = var_1648_split_sizes_0, x = q_21)[name = string("op_1648")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1650 = mul(x = var_1648_1, y = const_26_promoted)[name = string("op_1650")];
+            int32 var_1652 = const()[name = string("op_1652"), val = int32(-1)];
+            bool var_1653_interleave_0 = const()[name = string("op_1653_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1653 = concat(axis = var_1652, interleave = var_1653_interleave_0, values = (var_1650, var_1648_0))[name = string("op_1653")];
+            tensor<fp16, [1, 8, 1, 256]> var_1654_cast_fp16 = mul(x = var_1653, y = sin_s)[name = string("op_1654_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_23_cast_fp16 = add(x = var_1647_cast_fp16, y = var_1654_cast_fp16)[name = string("q_23_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_23_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_51_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_51_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_51_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 1, 512]> var_1686 = sub(x = x_51_cast_fp16, y = reduce_max_3)[name = string("op_1686")];
+            tensor<fp16, [1, 8, 1, 512]> var_1692 = exp(x = var_1686)[name = string("op_1692")];
+            tensor<int32, [1]> var_1702_axes_0 = const()[name = string("op_1702_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1702_keep_dims_0 = const()[name = string("op_1702_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1702 = reduce_sum(axes = var_1702_axes_0, keep_dims = var_1702_keep_dims_0, x = var_1692)[name = string("op_1702")];
+            tensor<fp16, [1, 8, 1, 512]> var_1708_cast_fp16 = real_div(x = var_1692, y = var_1702)[name = string("op_1708_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_1708_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_1719 = const()[name = string("op_1719"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1726 = const()[name = string("op_1726"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1720_cast_fp16 = transpose(perm = var_1719, x = attn_output_19_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 1, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_1726, x = var_1720_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_1731 = const()[name = string("op_1731"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1747_pad_type_0 = const()[name = string("op_1747_pad_type_0"), val = string("valid")];
+            int32 var_1747_groups_0 = const()[name = string("op_1747_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1747_strides_0 = const()[name = string("op_1747_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1747_pad_0 = const()[name = string("op_1747_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1747_dilations_0 = const()[name = string("op_1747_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392273088))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394894592))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1732_cast_fp16 = transpose(perm = var_1731, x = attn_output_21_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 2560, 1]> var_1747_cast_fp16 = conv(dilations = var_1747_dilations_0, groups = var_1747_groups_0, pad = var_1747_pad_0, pad_type = var_1747_pad_type_0, strides = var_1747_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_1732_cast_fp16)[name = string("op_1747_cast_fp16")];
+            tensor<int32, [3]> var_1751 = const()[name = string("op_1751"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1757 = const()[name = string("op_1757"), val = int32(-1)];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_55_cast_fp16 = transpose(perm = var_1751, x = var_1747_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 1, 2560]> var_1759_cast_fp16 = mul(x = x_55_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_1759_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_81_cast_fp16 = concat(axis = var_1757, interleave = input_81_interleave_0, values = (x_55_cast_fp16, var_1759_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1754_to_fp16 = const()[name = string("op_1754_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_1754_to_fp16, x = input_81_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_1764_split_sizes_0 = const()[name = string("op_1764_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1764_axis_0 = const()[name = string("op_1764_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1764_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1764_cast_fp16_1 = split(axis = var_1764_axis_0, split_sizes = var_1764_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_1764_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394897216)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_23_cast_fp16 = mul(x = var_1764_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_57_cast_fp16 = add(x = x_47_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_57_cast_fp16")];
+            int32 var_1773 = const()[name = string("op_1773"), val = int32(-1)];
+            fp16 const_28_promoted_to_fp16 = const()[name = string("const_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1775_cast_fp16 = mul(x = x_57_cast_fp16, y = const_28_promoted_to_fp16)[name = string("op_1775_cast_fp16")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_83_cast_fp16 = concat(axis = var_1773, interleave = input_83_interleave_0, values = (x_57_cast_fp16, var_1775_cast_fp16))[name = string("input_83_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1770_to_fp16 = const()[name = string("op_1770_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_1770_to_fp16, x = input_83_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_1780_split_sizes_0 = const()[name = string("op_1780_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1780_axis_0 = const()[name = string("op_1780_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1780_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1780_cast_fp16_1 = split(axis = var_1780_axis_0, split_sizes = var_1780_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_1780_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394902400)))];
+            tensor<fp16, [1, 1, 2560]> h_21_cast_fp16 = mul(x = var_1780_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_1791 = const()[name = string("op_1791"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1792 = transpose(perm = var_1791, x = h_21_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 2560, 1, 1]> input_85 = expand_dims(axes = input_85_axes_0, x = var_1792)[name = string("input_85")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_85)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_85)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 1]> input_87 = mul(x = gate_15, y = up_7)[name = string("input_87")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_87)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_1832_axes_0 = const()[name = string("op_1832_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1832 = squeeze(axes = var_1832_axes_0, x = mlp_out_7)[name = string("op_1832")];
+            tensor<int32, [3]> var_1836 = const()[name = string("op_1836"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1842 = const()[name = string("op_1842"), val = int32(-1)];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_59 = transpose(perm = var_1836, x = var_1832)[name = string("transpose_59")];
+            tensor<fp16, [1, 1, 2560]> var_1844 = mul(x = x_59, y = const_29_promoted)[name = string("op_1844")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_89 = concat(axis = var_1842, interleave = input_89_interleave_0, values = (x_59, var_1844))[name = string("input_89")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1839_to_fp16 = const()[name = string("op_1839_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_1839_to_fp16, x = input_89)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_1849_split_sizes_0 = const()[name = string("op_1849_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1849_axis_0 = const()[name = string("op_1849_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1849_0, tensor<fp16, [1, 1, 2560]> var_1849_1 = split(axis = var_1849_axis_0, split_sizes = var_1849_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_1849")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_33 = mul(x = var_1849_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_35_cast_fp16 = add(x = x_57_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 6912])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 1, 7168])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_1877 = const()[name = string("op_1877"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_91_axes_0 = const()[name = string("input_91_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1878 = transpose(perm = var_1877, x = hidden_states_35_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 2560, 1, 1]> input_91 = expand_dims(axes = input_91_axes_0, x = var_1878)[name = string("input_91")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_91)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_1897 = const()[name = string("op_1897"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1898_cast_fp16 = transpose(perm = var_1897, x = per_layer_slice_7_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_1898_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_93_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_93_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394907584))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395235328))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_93_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_1914_axes_0 = const()[name = string("op_1914_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1914_cast_fp16 = squeeze(axes = var_1914_axes_0, x = gated_23_cast_fp16)[name = string("op_1914_cast_fp16")];
+            tensor<int32, [3]> var_1918 = const()[name = string("op_1918"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1924 = const()[name = string("op_1924"), val = int32(-1)];
+            fp16 const_30_promoted_to_fp16 = const()[name = string("const_30_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_61_cast_fp16 = transpose(perm = var_1918, x = var_1914_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 1, 2560]> var_1926_cast_fp16 = mul(x = x_61_cast_fp16, y = const_30_promoted_to_fp16)[name = string("op_1926_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_95_cast_fp16 = concat(axis = var_1924, interleave = input_95_interleave_0, values = (x_61_cast_fp16, var_1926_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1921_to_fp16 = const()[name = string("op_1921_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_1921_to_fp16, x = input_95_cast_fp16)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_1931_split_sizes_0 = const()[name = string("op_1931_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1931_axis_0 = const()[name = string("op_1931_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1931_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1931_cast_fp16_1 = split(axis = var_1931_axis_0, split_sizes = var_1931_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_1931_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395237952)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_39_cast_fp16 = mul(x = var_1931_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.62p-1])];
+            tensor<fp16, [1, 1, 2560]> x_63_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_31_promoted_to_fp16)[name = string("x_63_cast_fp16")];
+            int32 var_1946 = const()[name = string("op_1946"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1948_cast_fp16 = mul(x = x_63_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_1948_cast_fp16")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_97_cast_fp16 = concat(axis = var_1946, interleave = input_97_interleave_0, values = (x_63_cast_fp16, var_1948_cast_fp16))[name = string("input_97_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1943_to_fp16 = const()[name = string("op_1943_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_1943_to_fp16, x = input_97_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_1953_split_sizes_0 = const()[name = string("op_1953_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1953_axis_0 = const()[name = string("op_1953_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1953_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1953_cast_fp16_1 = split(axis = var_1953_axis_0, split_sizes = var_1953_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_1953_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395243136)))];
+            tensor<fp16, [1, 1, 2560]> h_25_cast_fp16 = mul(x = var_1953_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_1959 = const()[name = string("op_1959"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1962_axes_0 = const()[name = string("op_1962_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1960_cast_fp16 = transpose(perm = var_1959, x = h_25_cast_fp16)[name = string("transpose_55")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1962_cast_fp16 = expand_dims(axes = var_1962_axes_0, x = var_1960_cast_fp16)[name = string("op_1962_cast_fp16")];
+            string var_1978_pad_type_0 = const()[name = string("op_1978_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1978_strides_0 = const()[name = string("op_1978_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1978_pad_0 = const()[name = string("op_1978_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1978_dilations_0 = const()[name = string("op_1978_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1978_groups_0 = const()[name = string("op_1978_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1978 = conv(dilations = var_1978_dilations_0, groups = var_1978_groups_0, pad = var_1978_pad_0, pad_type = var_1978_pad_type_0, strides = var_1978_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_1962_cast_fp16)[name = string("op_1978")];
+            tensor<int32, [4]> var_1983 = const()[name = string("op_1983"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1984 = reshape(shape = var_1983, x = var_1978)[name = string("op_1984")];
+            tensor<int32, [4]> var_1989 = const()[name = string("op_1989"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1999 = const()[name = string("op_1999"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1990 = transpose(perm = var_1989, x = var_1984)[name = string("transpose_54")];
+            tensor<fp16, [1, 8, 256]> x_65 = reshape(shape = var_1999, x = var_1990)[name = string("x_65")];
+            int32 var_2005 = const()[name = string("op_2005"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2007 = mul(x = x_65, y = const_33_promoted)[name = string("op_2007")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_101 = concat(axis = var_2005, interleave = input_101_interleave_0, values = (x_65, var_2007))[name = string("input_101")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2002_to_fp16 = const()[name = string("op_2002_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2002_to_fp16, x = input_101)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2012_split_sizes_0 = const()[name = string("op_2012_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2012_axis_0 = const()[name = string("op_2012_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2012_0, tensor<fp16, [1, 8, 256]> var_2012_1 = split(axis = var_2012_axis_0, split_sizes = var_2012_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2012")];
+            tensor<fp16, [1, 8, 256]> var_2014 = mul(x = var_2012_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2014")];
+            tensor<int32, [4]> var_2019 = const()[name = string("op_2019"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_27 = reshape(shape = var_2019, x = var_2014)[name = string("q_27")];
+            tensor<fp16, [1, 8, 1, 256]> var_2021_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_2021_cast_fp16")];
+            tensor<int32, [2]> var_2022_split_sizes_0 = const()[name = string("op_2022_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2022_axis_0 = const()[name = string("op_2022_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2022_0, tensor<fp16, [1, 8, 1, 128]> var_2022_1 = split(axis = var_2022_axis_0, split_sizes = var_2022_split_sizes_0, x = q_27)[name = string("op_2022")];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2024 = mul(x = var_2022_1, y = const_34_promoted)[name = string("op_2024")];
+            int32 var_2026 = const()[name = string("op_2026"), val = int32(-1)];
+            bool var_2027_interleave_0 = const()[name = string("op_2027_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2027 = concat(axis = var_2026, interleave = var_2027_interleave_0, values = (var_2024, var_2022_0))[name = string("op_2027")];
+            tensor<fp16, [1, 8, 1, 256]> var_2028_cast_fp16 = mul(x = var_2027, y = sin_s)[name = string("op_2028_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_29_cast_fp16 = add(x = var_2021_cast_fp16, y = var_2028_cast_fp16)[name = string("q_29_cast_fp16")];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_29_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_67_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 1, 512]> var_2060 = sub(x = x_67_cast_fp16, y = reduce_max_4)[name = string("op_2060")];
+            tensor<fp16, [1, 8, 1, 512]> var_2066 = exp(x = var_2060)[name = string("op_2066")];
+            tensor<int32, [1]> var_2076_axes_0 = const()[name = string("op_2076_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2076_keep_dims_0 = const()[name = string("op_2076_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2076 = reduce_sum(axes = var_2076_axes_0, keep_dims = var_2076_keep_dims_0, x = var_2066)[name = string("op_2076")];
+            tensor<fp16, [1, 8, 1, 512]> var_2082_cast_fp16 = real_div(x = var_2066, y = var_2076)[name = string("op_2082_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_2082_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_2093 = const()[name = string("op_2093"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2100 = const()[name = string("op_2100"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2094_cast_fp16 = transpose(perm = var_2093, x = attn_output_25_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 1, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_2100, x = var_2094_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_2105 = const()[name = string("op_2105"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2121_pad_type_0 = const()[name = string("op_2121_pad_type_0"), val = string("valid")];
+            int32 var_2121_groups_0 = const()[name = string("op_2121_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2121_strides_0 = const()[name = string("op_2121_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2121_pad_0 = const()[name = string("op_2121_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2121_dilations_0 = const()[name = string("op_2121_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395248320))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397869824))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2106_cast_fp16 = transpose(perm = var_2105, x = attn_output_27_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 2560, 1]> var_2121_cast_fp16 = conv(dilations = var_2121_dilations_0, groups = var_2121_groups_0, pad = var_2121_pad_0, pad_type = var_2121_pad_type_0, strides = var_2121_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_2106_cast_fp16)[name = string("op_2121_cast_fp16")];
+            tensor<int32, [3]> var_2125 = const()[name = string("op_2125"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2131 = const()[name = string("op_2131"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_71_cast_fp16 = transpose(perm = var_2125, x = var_2121_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 1, 2560]> var_2133_cast_fp16 = mul(x = x_71_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2133_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_105_cast_fp16 = concat(axis = var_2131, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_2133_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2128_to_fp16 = const()[name = string("op_2128_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2128_to_fp16, x = input_105_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2138_split_sizes_0 = const()[name = string("op_2138_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2138_axis_0 = const()[name = string("op_2138_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2138_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2138_cast_fp16_1 = split(axis = var_2138_axis_0, split_sizes = var_2138_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2138_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397872448)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_29_cast_fp16 = mul(x = var_2138_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_73_cast_fp16 = add(x = x_63_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2147 = const()[name = string("op_2147"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2149_cast_fp16 = mul(x = x_73_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2149_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_107_cast_fp16 = concat(axis = var_2147, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_2149_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2144_to_fp16 = const()[name = string("op_2144_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2144_to_fp16, x = input_107_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2154_split_sizes_0 = const()[name = string("op_2154_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2154_axis_0 = const()[name = string("op_2154_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2154_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2154_cast_fp16_1 = split(axis = var_2154_axis_0, split_sizes = var_2154_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2154_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397877632)))];
+            tensor<fp16, [1, 1, 2560]> h_27_cast_fp16 = mul(x = var_2154_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_2165 = const()[name = string("op_2165"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2166 = transpose(perm = var_2165, x = h_27_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [1, 2560, 1, 1]> input_109 = expand_dims(axes = input_109_axes_0, x = var_2166)[name = string("input_109")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 1]> input_111 = mul(x = gate_19, y = up_9)[name = string("input_111")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_2206_axes_0 = const()[name = string("op_2206_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2206 = squeeze(axes = var_2206_axes_0, x = mlp_out_9)[name = string("op_2206")];
+            tensor<int32, [3]> var_2210 = const()[name = string("op_2210"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2216 = const()[name = string("op_2216"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_75 = transpose(perm = var_2210, x = var_2206)[name = string("transpose_49")];
+            tensor<fp16, [1, 1, 2560]> var_2218 = mul(x = x_75, y = const_37_promoted)[name = string("op_2218")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_113 = concat(axis = var_2216, interleave = input_113_interleave_0, values = (x_75, var_2218))[name = string("input_113")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2213_to_fp16 = const()[name = string("op_2213_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_2213_to_fp16, x = input_113)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_2223_split_sizes_0 = const()[name = string("op_2223_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2223_axis_0 = const()[name = string("op_2223_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2223_0, tensor<fp16, [1, 1, 2560]> var_2223_1 = split(axis = var_2223_axis_0, split_sizes = var_2223_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_2223")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_43 = mul(x = var_2223_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_45_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 7168])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 1, 7424])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_2251 = const()[name = string("op_2251"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2252 = transpose(perm = var_2251, x = hidden_states_45_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [1, 2560, 1, 1]> input_115 = expand_dims(axes = input_115_axes_0, x = var_2252)[name = string("input_115")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_2271 = const()[name = string("op_2271"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2272_cast_fp16 = transpose(perm = var_2271, x = per_layer_slice_9_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_2272_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_117_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397882816))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398210560))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_2288_axes_0 = const()[name = string("op_2288_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2288_cast_fp16 = squeeze(axes = var_2288_axes_0, x = gated_29_cast_fp16)[name = string("op_2288_cast_fp16")];
+            tensor<int32, [3]> var_2292 = const()[name = string("op_2292"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2298 = const()[name = string("op_2298"), val = int32(-1)];
+            fp16 const_38_promoted_to_fp16 = const()[name = string("const_38_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_77_cast_fp16 = transpose(perm = var_2292, x = var_2288_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 1, 2560]> var_2300_cast_fp16 = mul(x = x_77_cast_fp16, y = const_38_promoted_to_fp16)[name = string("op_2300_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_119_cast_fp16 = concat(axis = var_2298, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_2300_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2295_to_fp16 = const()[name = string("op_2295_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_2295_to_fp16, x = input_119_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_2305_split_sizes_0 = const()[name = string("op_2305_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2305_axis_0 = const()[name = string("op_2305_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2305_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2305_cast_fp16_1 = split(axis = var_2305_axis_0, split_sizes = var_2305_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_2305_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398213184)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_49_cast_fp16 = mul(x = var_2305_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_39_promoted_to_fp16 = const()[name = string("const_39_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.3ap-1])];
+            tensor<fp16, [1, 1, 2560]> x_79_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_39_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_2320 = const()[name = string("op_2320"), val = int32(-1)];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2322_cast_fp16 = mul(x = x_79_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_2322_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_121_cast_fp16 = concat(axis = var_2320, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_2322_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2317_to_fp16 = const()[name = string("op_2317_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_2317_to_fp16, x = input_121_cast_fp16)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_2327_split_sizes_0 = const()[name = string("op_2327_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2327_axis_0 = const()[name = string("op_2327_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2327_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2327_cast_fp16_1 = split(axis = var_2327_axis_0, split_sizes = var_2327_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_2327_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398218368)))];
+            tensor<fp16, [1, 1, 2560]> h_31_cast_fp16 = mul(x = var_2327_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_2333 = const()[name = string("op_2333"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2336_axes_0 = const()[name = string("op_2336_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2334_cast_fp16 = transpose(perm = var_2333, x = h_31_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2336_cast_fp16 = expand_dims(axes = var_2336_axes_0, x = var_2334_cast_fp16)[name = string("op_2336_cast_fp16")];
+            string var_2352_pad_type_0 = const()[name = string("op_2352_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2352_strides_0 = const()[name = string("op_2352_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2352_pad_0 = const()[name = string("op_2352_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2352_dilations_0 = const()[name = string("op_2352_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2352_groups_0 = const()[name = string("op_2352_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_2352 = conv(dilations = var_2352_dilations_0, groups = var_2352_groups_0, pad = var_2352_pad_0, pad_type = var_2352_pad_type_0, strides = var_2352_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_2336_cast_fp16)[name = string("op_2352")];
+            tensor<int32, [4]> var_2357 = const()[name = string("op_2357"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_2358 = reshape(shape = var_2357, x = var_2352)[name = string("op_2358")];
+            tensor<int32, [4]> var_2363 = const()[name = string("op_2363"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2373 = const()[name = string("op_2373"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_2364 = transpose(perm = var_2363, x = var_2358)[name = string("transpose_44")];
+            tensor<fp16, [1, 8, 512]> x_81 = reshape(shape = var_2373, x = var_2364)[name = string("x_81")];
+            int32 var_2379 = const()[name = string("op_2379"), val = int32(-1)];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_2381 = mul(x = x_81, y = const_41_promoted)[name = string("op_2381")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_125 = concat(axis = var_2379, interleave = input_125_interleave_0, values = (x_81, var_2381))[name = string("input_125")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2376_to_fp16 = const()[name = string("op_2376_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_2376_to_fp16, x = input_125)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_2386_split_sizes_0 = const()[name = string("op_2386_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_2386_axis_0 = const()[name = string("op_2386_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_2386_0, tensor<fp16, [1, 8, 512]> var_2386_1 = split(axis = var_2386_axis_0, split_sizes = var_2386_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_2386")];
+            tensor<fp16, [1, 8, 512]> var_2388 = mul(x = var_2386_0, y = layers_5_self_attn_q_norm_weight)[name = string("op_2388")];
+            tensor<int32, [4]> var_2393 = const()[name = string("op_2393"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_33 = reshape(shape = var_2393, x = var_2388)[name = string("q_33")];
+            tensor<fp16, [1, 8, 1, 512]> var_2395_cast_fp16 = mul(x = q_33, y = cos_f)[name = string("op_2395_cast_fp16")];
+            tensor<int32, [2]> var_2396_split_sizes_0 = const()[name = string("op_2396_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2396_axis_0 = const()[name = string("op_2396_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_2396_0, tensor<fp16, [1, 8, 1, 256]> var_2396_1 = split(axis = var_2396_axis_0, split_sizes = var_2396_split_sizes_0, x = q_33)[name = string("op_2396")];
+            fp16 const_42_promoted = const()[name = string("const_42_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_2398 = mul(x = var_2396_1, y = const_42_promoted)[name = string("op_2398")];
+            int32 var_2400 = const()[name = string("op_2400"), val = int32(-1)];
+            bool var_2401_interleave_0 = const()[name = string("op_2401_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_2401 = concat(axis = var_2400, interleave = var_2401_interleave_0, values = (var_2398, var_2396_0))[name = string("op_2401")];
+            tensor<fp16, [1, 8, 1, 512]> var_2402_cast_fp16 = mul(x = var_2401, y = sin_f)[name = string("op_2402_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_35_cast_fp16 = add(x = var_2395_cast_fp16, y = var_2402_cast_fp16)[name = string("q_35_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = kv14_k)[name = string("transpose_43")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_20, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_21 = const()[name = string("concat_21"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_21, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = kv14_v)[name = string("transpose_41")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_22, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_23, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_35_cast_fp16, y = transpose_41_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_83_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_83_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_83_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 1, 2048]> var_2434 = sub(x = x_83_cast_fp16, y = reduce_max_5)[name = string("op_2434")];
+            tensor<fp16, [1, 8, 1, 2048]> var_2440 = exp(x = var_2434)[name = string("op_2440")];
+            tensor<int32, [1]> var_2450_axes_0 = const()[name = string("op_2450_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2450_keep_dims_0 = const()[name = string("op_2450_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2450 = reduce_sum(axes = var_2450_axes_0, keep_dims = var_2450_keep_dims_0, x = var_2440)[name = string("op_2450")];
+            tensor<fp16, [1, 8, 1, 2048]> var_2456_cast_fp16 = real_div(x = var_2440, y = var_2450)[name = string("op_2456_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_2456_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_2467 = const()[name = string("op_2467"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2474 = const()[name = string("op_2474"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_2468_cast_fp16 = transpose(perm = var_2467, x = attn_output_31_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 1, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_2474, x = var_2468_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_2479 = const()[name = string("op_2479"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2495_pad_type_0 = const()[name = string("op_2495_pad_type_0"), val = string("valid")];
+            int32 var_2495_groups_0 = const()[name = string("op_2495_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2495_strides_0 = const()[name = string("op_2495_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2495_pad_0 = const()[name = string("op_2495_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2495_dilations_0 = const()[name = string("op_2495_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398223552))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403466496))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_2480_cast_fp16 = transpose(perm = var_2479, x = attn_output_33_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 2560, 1]> var_2495_cast_fp16 = conv(dilations = var_2495_dilations_0, groups = var_2495_groups_0, pad = var_2495_pad_0, pad_type = var_2495_pad_type_0, strides = var_2495_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_2480_cast_fp16)[name = string("op_2495_cast_fp16")];
+            tensor<int32, [3]> var_2499 = const()[name = string("op_2499"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2505 = const()[name = string("op_2505"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_87_cast_fp16 = transpose(perm = var_2499, x = var_2495_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 1, 2560]> var_2507_cast_fp16 = mul(x = x_87_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_2507_cast_fp16")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_129_cast_fp16 = concat(axis = var_2505, interleave = input_129_interleave_0, values = (x_87_cast_fp16, var_2507_cast_fp16))[name = string("input_129_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2502_to_fp16 = const()[name = string("op_2502_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_2502_to_fp16, x = input_129_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_2512_split_sizes_0 = const()[name = string("op_2512_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2512_axis_0 = const()[name = string("op_2512_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2512_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2512_cast_fp16_1 = split(axis = var_2512_axis_0, split_sizes = var_2512_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_2512_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403469120)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_35_cast_fp16 = mul(x = var_2512_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_89_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_89_cast_fp16")];
+            int32 var_2521 = const()[name = string("op_2521"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2523_cast_fp16 = mul(x = x_89_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2523_cast_fp16")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_131_cast_fp16 = concat(axis = var_2521, interleave = input_131_interleave_0, values = (x_89_cast_fp16, var_2523_cast_fp16))[name = string("input_131_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2518_to_fp16 = const()[name = string("op_2518_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_2518_to_fp16, x = input_131_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_2528_split_sizes_0 = const()[name = string("op_2528_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2528_axis_0 = const()[name = string("op_2528_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2528_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2528_cast_fp16_1 = split(axis = var_2528_axis_0, split_sizes = var_2528_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_2528_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403474304)))];
+            tensor<fp16, [1, 1, 2560]> h_33_cast_fp16 = mul(x = var_2528_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_2539 = const()[name = string("op_2539"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_133_axes_0 = const()[name = string("input_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2540 = transpose(perm = var_2539, x = h_33_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 2560, 1, 1]> input_133 = expand_dims(axes = input_133_axes_0, x = var_2540)[name = string("input_133")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_133)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_133)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 1]> input_135 = mul(x = gate_23, y = up_11)[name = string("input_135")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_135)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_2580_axes_0 = const()[name = string("op_2580_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2580 = squeeze(axes = var_2580_axes_0, x = mlp_out_11)[name = string("op_2580")];
+            tensor<int32, [3]> var_2584 = const()[name = string("op_2584"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2590 = const()[name = string("op_2590"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_91 = transpose(perm = var_2584, x = var_2580)[name = string("transpose_33")];
+            tensor<fp16, [1, 1, 2560]> var_2592 = mul(x = x_91, y = const_45_promoted)[name = string("op_2592")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_137 = concat(axis = var_2590, interleave = input_137_interleave_0, values = (x_91, var_2592))[name = string("input_137")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2587_to_fp16 = const()[name = string("op_2587_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_2587_to_fp16, x = input_137)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_2597_split_sizes_0 = const()[name = string("op_2597_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2597_axis_0 = const()[name = string("op_2597_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2597_0, tensor<fp16, [1, 1, 2560]> var_2597_1 = split(axis = var_2597_axis_0, split_sizes = var_2597_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_2597")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_53 = mul(x = var_2597_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_55_cast_fp16 = add(x = x_89_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 7424])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 1, 7680])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_2625 = const()[name = string("op_2625"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2626 = transpose(perm = var_2625, x = hidden_states_55_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [1, 2560, 1, 1]> input_139 = expand_dims(axes = input_139_axes_0, x = var_2626)[name = string("input_139")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_139)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_2645 = const()[name = string("op_2645"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2646_cast_fp16 = transpose(perm = var_2645, x = per_layer_slice_11_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_2646_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_141_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_141_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403479488))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403807232))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_141_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_2662_axes_0 = const()[name = string("op_2662_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2662_cast_fp16 = squeeze(axes = var_2662_axes_0, x = gated_35_cast_fp16)[name = string("op_2662_cast_fp16")];
+            tensor<int32, [3]> var_2666 = const()[name = string("op_2666"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2672 = const()[name = string("op_2672"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_93_cast_fp16 = transpose(perm = var_2666, x = var_2662_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [1, 1, 2560]> var_2674_cast_fp16 = mul(x = x_93_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_2674_cast_fp16")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_143_cast_fp16 = concat(axis = var_2672, interleave = input_143_interleave_0, values = (x_93_cast_fp16, var_2674_cast_fp16))[name = string("input_143_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2669_to_fp16 = const()[name = string("op_2669_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_2669_to_fp16, x = input_143_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_2679_split_sizes_0 = const()[name = string("op_2679_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2679_axis_0 = const()[name = string("op_2679_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2679_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2679_cast_fp16_1 = split(axis = var_2679_axis_0, split_sizes = var_2679_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_2679_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403809856)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_59_cast_fp16 = mul(x = var_2679_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.aep-2])];
+            tensor<fp16, [1, 1, 2560]> x_95_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_95_cast_fp16")];
+            int32 var_2694 = const()[name = string("op_2694"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2696_cast_fp16 = mul(x = x_95_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_2696_cast_fp16")];
+            bool input_145_interleave_0 = const()[name = string("input_145_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_145_cast_fp16 = concat(axis = var_2694, interleave = input_145_interleave_0, values = (x_95_cast_fp16, var_2696_cast_fp16))[name = string("input_145_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2691_to_fp16 = const()[name = string("op_2691_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_2691_to_fp16, x = input_145_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_2701_split_sizes_0 = const()[name = string("op_2701_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2701_axis_0 = const()[name = string("op_2701_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2701_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2701_cast_fp16_1 = split(axis = var_2701_axis_0, split_sizes = var_2701_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_2701_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403815040)))];
+            tensor<fp16, [1, 1, 2560]> h_37_cast_fp16 = mul(x = var_2701_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_2707 = const()[name = string("op_2707"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2710_axes_0 = const()[name = string("op_2710_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2708_cast_fp16 = transpose(perm = var_2707, x = h_37_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2710_cast_fp16 = expand_dims(axes = var_2710_axes_0, x = var_2708_cast_fp16)[name = string("op_2710_cast_fp16")];
+            string var_2726_pad_type_0 = const()[name = string("op_2726_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2726_strides_0 = const()[name = string("op_2726_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2726_pad_0 = const()[name = string("op_2726_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2726_dilations_0 = const()[name = string("op_2726_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2726_groups_0 = const()[name = string("op_2726_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2726 = conv(dilations = var_2726_dilations_0, groups = var_2726_groups_0, pad = var_2726_pad_0, pad_type = var_2726_pad_type_0, strides = var_2726_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_2710_cast_fp16)[name = string("op_2726")];
+            tensor<int32, [4]> var_2731 = const()[name = string("op_2731"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2732 = reshape(shape = var_2731, x = var_2726)[name = string("op_2732")];
+            tensor<int32, [4]> var_2737 = const()[name = string("op_2737"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2747 = const()[name = string("op_2747"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2738 = transpose(perm = var_2737, x = var_2732)[name = string("transpose_28")];
+            tensor<fp16, [1, 8, 256]> x_97 = reshape(shape = var_2747, x = var_2738)[name = string("x_97")];
+            int32 var_2753 = const()[name = string("op_2753"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2755 = mul(x = x_97, y = const_49_promoted)[name = string("op_2755")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_149 = concat(axis = var_2753, interleave = input_149_interleave_0, values = (x_97, var_2755))[name = string("input_149")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2750_to_fp16 = const()[name = string("op_2750_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_2750_to_fp16, x = input_149)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_2760_split_sizes_0 = const()[name = string("op_2760_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2760_axis_0 = const()[name = string("op_2760_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2760_0, tensor<fp16, [1, 8, 256]> var_2760_1 = split(axis = var_2760_axis_0, split_sizes = var_2760_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_2760")];
+            tensor<fp16, [1, 8, 256]> var_2762 = mul(x = var_2760_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2762")];
+            tensor<int32, [4]> var_2767 = const()[name = string("op_2767"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_39 = reshape(shape = var_2767, x = var_2762)[name = string("q_39")];
+            tensor<fp16, [1, 8, 1, 256]> var_2769_cast_fp16 = mul(x = q_39, y = cos_s)[name = string("op_2769_cast_fp16")];
+            tensor<int32, [2]> var_2770_split_sizes_0 = const()[name = string("op_2770_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2770_axis_0 = const()[name = string("op_2770_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2770_0, tensor<fp16, [1, 8, 1, 128]> var_2770_1 = split(axis = var_2770_axis_0, split_sizes = var_2770_split_sizes_0, x = q_39)[name = string("op_2770")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2772 = mul(x = var_2770_1, y = const_50_promoted)[name = string("op_2772")];
+            int32 var_2774 = const()[name = string("op_2774"), val = int32(-1)];
+            bool var_2775_interleave_0 = const()[name = string("op_2775_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2775 = concat(axis = var_2774, interleave = var_2775_interleave_0, values = (var_2772, var_2770_0))[name = string("op_2775")];
+            tensor<fp16, [1, 8, 1, 256]> var_2776_cast_fp16 = mul(x = var_2775, y = sin_s)[name = string("op_2776_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_41_cast_fp16 = add(x = var_2769_cast_fp16, y = var_2776_cast_fp16)[name = string("q_41_cast_fp16")];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_41_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_99_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_99_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 1, 512]> var_2808 = sub(x = x_99_cast_fp16, y = reduce_max_6)[name = string("op_2808")];
+            tensor<fp16, [1, 8, 1, 512]> var_2814 = exp(x = var_2808)[name = string("op_2814")];
+            tensor<int32, [1]> var_2824_axes_0 = const()[name = string("op_2824_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2824_keep_dims_0 = const()[name = string("op_2824_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2824 = reduce_sum(axes = var_2824_axes_0, keep_dims = var_2824_keep_dims_0, x = var_2814)[name = string("op_2824")];
+            tensor<fp16, [1, 8, 1, 512]> var_2830_cast_fp16 = real_div(x = var_2814, y = var_2824)[name = string("op_2830_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_2830_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_2841 = const()[name = string("op_2841"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2848 = const()[name = string("op_2848"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2842_cast_fp16 = transpose(perm = var_2841, x = attn_output_37_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 1, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_2848, x = var_2842_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_2853 = const()[name = string("op_2853"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2869_pad_type_0 = const()[name = string("op_2869_pad_type_0"), val = string("valid")];
+            int32 var_2869_groups_0 = const()[name = string("op_2869_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2869_strides_0 = const()[name = string("op_2869_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2869_pad_0 = const()[name = string("op_2869_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2869_dilations_0 = const()[name = string("op_2869_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403820224))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406441728))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2854_cast_fp16 = transpose(perm = var_2853, x = attn_output_39_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 1]> var_2869_cast_fp16 = conv(dilations = var_2869_dilations_0, groups = var_2869_groups_0, pad = var_2869_pad_0, pad_type = var_2869_pad_type_0, strides = var_2869_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_2854_cast_fp16)[name = string("op_2869_cast_fp16")];
+            tensor<int32, [3]> var_2873 = const()[name = string("op_2873"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2879 = const()[name = string("op_2879"), val = int32(-1)];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_103_cast_fp16 = transpose(perm = var_2873, x = var_2869_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 1, 2560]> var_2881_cast_fp16 = mul(x = x_103_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_2881_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_153_cast_fp16 = concat(axis = var_2879, interleave = input_153_interleave_0, values = (x_103_cast_fp16, var_2881_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2876_to_fp16 = const()[name = string("op_2876_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_2876_to_fp16, x = input_153_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_2886_split_sizes_0 = const()[name = string("op_2886_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2886_axis_0 = const()[name = string("op_2886_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2886_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2886_cast_fp16_1 = split(axis = var_2886_axis_0, split_sizes = var_2886_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_2886_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406444352)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_41_cast_fp16 = mul(x = var_2886_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_105_cast_fp16 = add(x = x_95_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_105_cast_fp16")];
+            int32 var_2895 = const()[name = string("op_2895"), val = int32(-1)];
+            fp16 const_52_promoted_to_fp16 = const()[name = string("const_52_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2897_cast_fp16 = mul(x = x_105_cast_fp16, y = const_52_promoted_to_fp16)[name = string("op_2897_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_155_cast_fp16 = concat(axis = var_2895, interleave = input_155_interleave_0, values = (x_105_cast_fp16, var_2897_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2892_to_fp16 = const()[name = string("op_2892_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_2892_to_fp16, x = input_155_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_2902_split_sizes_0 = const()[name = string("op_2902_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2902_axis_0 = const()[name = string("op_2902_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2902_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2902_cast_fp16_1 = split(axis = var_2902_axis_0, split_sizes = var_2902_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_2902_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406449536)))];
+            tensor<fp16, [1, 1, 2560]> h_39_cast_fp16 = mul(x = var_2902_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_2913 = const()[name = string("op_2913"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_157_axes_0 = const()[name = string("input_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2914 = transpose(perm = var_2913, x = h_39_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 1]> input_157 = expand_dims(axes = input_157_axes_0, x = var_2914)[name = string("input_157")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_157)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_157)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 1]> input_159 = mul(x = gate_27, y = up_13)[name = string("input_159")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_159)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_2954_axes_0 = const()[name = string("op_2954_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2954 = squeeze(axes = var_2954_axes_0, x = mlp_out_13)[name = string("op_2954")];
+            tensor<int32, [3]> var_2958 = const()[name = string("op_2958"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2964 = const()[name = string("op_2964"), val = int32(-1)];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_107 = transpose(perm = var_2958, x = var_2954)[name = string("transpose_23")];
+            tensor<fp16, [1, 1, 2560]> var_2966 = mul(x = x_107, y = const_53_promoted)[name = string("op_2966")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_161 = concat(axis = var_2964, interleave = input_161_interleave_0, values = (x_107, var_2966))[name = string("input_161")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2961_to_fp16 = const()[name = string("op_2961_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_2961_to_fp16, x = input_161)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_2971_split_sizes_0 = const()[name = string("op_2971_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2971_axis_0 = const()[name = string("op_2971_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2971_0, tensor<fp16, [1, 1, 2560]> var_2971_1 = split(axis = var_2971_axis_0, split_sizes = var_2971_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_2971")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_63 = mul(x = var_2971_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_65_cast_fp16 = add(x = x_105_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 7680])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 1, 7936])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_2999 = const()[name = string("op_2999"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_163_axes_0 = const()[name = string("input_163_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3000 = transpose(perm = var_2999, x = hidden_states_65_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 2560, 1, 1]> input_163 = expand_dims(axes = input_163_axes_0, x = var_3000)[name = string("input_163")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_163)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_3019 = const()[name = string("op_3019"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3020_cast_fp16 = transpose(perm = var_3019, x = per_layer_slice_13_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_3020_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_165_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_165_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406454720))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406782464))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_165_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_3036_axes_0 = const()[name = string("op_3036_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3036_cast_fp16 = squeeze(axes = var_3036_axes_0, x = gated_41_cast_fp16)[name = string("op_3036_cast_fp16")];
+            tensor<int32, [3]> var_3040 = const()[name = string("op_3040"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3046 = const()[name = string("op_3046"), val = int32(-1)];
+            fp16 const_54_promoted_to_fp16 = const()[name = string("const_54_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_109_cast_fp16 = transpose(perm = var_3040, x = var_3036_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 1, 2560]> var_3048_cast_fp16 = mul(x = x_109_cast_fp16, y = const_54_promoted_to_fp16)[name = string("op_3048_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_167_cast_fp16 = concat(axis = var_3046, interleave = input_167_interleave_0, values = (x_109_cast_fp16, var_3048_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3043_to_fp16 = const()[name = string("op_3043_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_3043_to_fp16, x = input_167_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_3053_split_sizes_0 = const()[name = string("op_3053_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3053_axis_0 = const()[name = string("op_3053_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3053_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3053_cast_fp16_1 = split(axis = var_3053_axis_0, split_sizes = var_3053_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_3053_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406785088)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_69_cast_fp16 = mul(x = var_3053_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 1, 2560]> x_111_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_55_promoted_to_fp16)[name = string("x_111_cast_fp16")];
+            int32 var_3068 = const()[name = string("op_3068"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3070_cast_fp16 = mul(x = x_111_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3070_cast_fp16")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_169_cast_fp16 = concat(axis = var_3068, interleave = input_169_interleave_0, values = (x_111_cast_fp16, var_3070_cast_fp16))[name = string("input_169_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3065_to_fp16 = const()[name = string("op_3065_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_3065_to_fp16, x = input_169_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_3075_split_sizes_0 = const()[name = string("op_3075_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3075_axis_0 = const()[name = string("op_3075_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3075_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3075_cast_fp16_1 = split(axis = var_3075_axis_0, split_sizes = var_3075_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_3075_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406790272)))];
+            tensor<fp16, [1, 1, 2560]> h_43_cast_fp16 = mul(x = var_3075_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_3081 = const()[name = string("op_3081"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3084_axes_0 = const()[name = string("op_3084_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3082_cast_fp16 = transpose(perm = var_3081, x = h_43_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3084_cast_fp16 = expand_dims(axes = var_3084_axes_0, x = var_3082_cast_fp16)[name = string("op_3084_cast_fp16")];
+            string var_3100_pad_type_0 = const()[name = string("op_3100_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3100_strides_0 = const()[name = string("op_3100_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3100_pad_0 = const()[name = string("op_3100_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3100_dilations_0 = const()[name = string("op_3100_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3100_groups_0 = const()[name = string("op_3100_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3100 = conv(dilations = var_3100_dilations_0, groups = var_3100_groups_0, pad = var_3100_pad_0, pad_type = var_3100_pad_type_0, strides = var_3100_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_3084_cast_fp16)[name = string("op_3100")];
+            tensor<int32, [4]> var_3105 = const()[name = string("op_3105"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3106 = reshape(shape = var_3105, x = var_3100)[name = string("op_3106")];
+            tensor<int32, [4]> var_3111 = const()[name = string("op_3111"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3121 = const()[name = string("op_3121"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3112 = transpose(perm = var_3111, x = var_3106)[name = string("transpose_18")];
+            tensor<fp16, [1, 8, 256]> x_113 = reshape(shape = var_3121, x = var_3112)[name = string("x_113")];
+            int32 var_3127 = const()[name = string("op_3127"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3129 = mul(x = x_113, y = const_57_promoted)[name = string("op_3129")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_173 = concat(axis = var_3127, interleave = input_173_interleave_0, values = (x_113, var_3129))[name = string("input_173")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3124_to_fp16 = const()[name = string("op_3124_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_3124_to_fp16, x = input_173)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_3134_split_sizes_0 = const()[name = string("op_3134_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3134_axis_0 = const()[name = string("op_3134_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3134_0, tensor<fp16, [1, 8, 256]> var_3134_1 = split(axis = var_3134_axis_0, split_sizes = var_3134_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_3134")];
+            tensor<fp16, [1, 8, 256]> var_3136 = mul(x = var_3134_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_3136")];
+            tensor<int32, [4]> var_3141 = const()[name = string("op_3141"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_45 = reshape(shape = var_3141, x = var_3136)[name = string("q_45")];
+            tensor<fp16, [1, 8, 1, 256]> var_3143_cast_fp16 = mul(x = q_45, y = cos_s)[name = string("op_3143_cast_fp16")];
+            tensor<int32, [2]> var_3144_split_sizes_0 = const()[name = string("op_3144_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3144_axis_0 = const()[name = string("op_3144_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3144_0, tensor<fp16, [1, 8, 1, 128]> var_3144_1 = split(axis = var_3144_axis_0, split_sizes = var_3144_split_sizes_0, x = q_45)[name = string("op_3144")];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3146 = mul(x = var_3144_1, y = const_58_promoted)[name = string("op_3146")];
+            int32 var_3148 = const()[name = string("op_3148"), val = int32(-1)];
+            bool var_3149_interleave_0 = const()[name = string("op_3149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3149 = concat(axis = var_3148, interleave = var_3149_interleave_0, values = (var_3146, var_3144_0))[name = string("op_3149")];
+            tensor<fp16, [1, 8, 1, 256]> var_3150_cast_fp16 = mul(x = var_3149, y = sin_s)[name = string("op_3150_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_47_cast_fp16 = add(x = var_3143_cast_fp16, y = var_3150_cast_fp16)[name = string("q_47_cast_fp16")];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_47_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_115_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_115_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_115_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 1, 512]> var_3182 = sub(x = x_115_cast_fp16, y = reduce_max_7)[name = string("op_3182")];
+            tensor<fp16, [1, 8, 1, 512]> var_3188 = exp(x = var_3182)[name = string("op_3188")];
+            tensor<int32, [1]> var_3198_axes_0 = const()[name = string("op_3198_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3198_keep_dims_0 = const()[name = string("op_3198_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3198 = reduce_sum(axes = var_3198_axes_0, keep_dims = var_3198_keep_dims_0, x = var_3188)[name = string("op_3198")];
+            tensor<fp16, [1, 8, 1, 512]> var_3204_cast_fp16 = real_div(x = var_3188, y = var_3198)[name = string("op_3204_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_3204_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_3215 = const()[name = string("op_3215"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3222 = const()[name = string("op_3222"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3216_cast_fp16 = transpose(perm = var_3215, x = attn_output_43_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 1, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_3222, x = var_3216_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_3227 = const()[name = string("op_3227"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3243_pad_type_0 = const()[name = string("op_3243_pad_type_0"), val = string("valid")];
+            int32 var_3243_groups_0 = const()[name = string("op_3243_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3243_strides_0 = const()[name = string("op_3243_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3243_pad_0 = const()[name = string("op_3243_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3243_dilations_0 = const()[name = string("op_3243_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406795456))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409416960))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3228_cast_fp16 = transpose(perm = var_3227, x = attn_output_45_cast_fp16)[name = string("transpose_16")];
+            tensor<fp16, [1, 2560, 1]> var_3243_cast_fp16 = conv(dilations = var_3243_dilations_0, groups = var_3243_groups_0, pad = var_3243_pad_0, pad_type = var_3243_pad_type_0, strides = var_3243_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_3228_cast_fp16)[name = string("op_3243_cast_fp16")];
+            tensor<int32, [3]> var_3247 = const()[name = string("op_3247"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3253 = const()[name = string("op_3253"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_119_cast_fp16 = transpose(perm = var_3247, x = var_3243_cast_fp16)[name = string("transpose_15")];
+            tensor<fp16, [1, 1, 2560]> var_3255_cast_fp16 = mul(x = x_119_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3255_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_177_cast_fp16 = concat(axis = var_3253, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_3255_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3250_to_fp16 = const()[name = string("op_3250_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_3250_to_fp16, x = input_177_cast_fp16)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_3260_split_sizes_0 = const()[name = string("op_3260_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3260_axis_0 = const()[name = string("op_3260_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3260_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3260_cast_fp16_1 = split(axis = var_3260_axis_0, split_sizes = var_3260_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_3260_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409419584)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_47_cast_fp16 = mul(x = var_3260_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_121_cast_fp16 = add(x = x_111_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_121_cast_fp16")];
+            int32 var_3269 = const()[name = string("op_3269"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3271_cast_fp16 = mul(x = x_121_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3271_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_179_cast_fp16 = concat(axis = var_3269, interleave = input_179_interleave_0, values = (x_121_cast_fp16, var_3271_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3266_to_fp16 = const()[name = string("op_3266_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_3266_to_fp16, x = input_179_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_3276_split_sizes_0 = const()[name = string("op_3276_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3276_axis_0 = const()[name = string("op_3276_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3276_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3276_cast_fp16_1 = split(axis = var_3276_axis_0, split_sizes = var_3276_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_3276_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409424768)))];
+            tensor<fp16, [1, 1, 2560]> h_45_cast_fp16 = mul(x = var_3276_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_3287 = const()[name = string("op_3287"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_181_axes_0 = const()[name = string("input_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3288 = transpose(perm = var_3287, x = h_45_cast_fp16)[name = string("transpose_14")];
+            tensor<fp16, [1, 2560, 1, 1]> input_181 = expand_dims(axes = input_181_axes_0, x = var_3288)[name = string("input_181")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_181)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_181)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 1]> input_183 = mul(x = gate_31, y = up_15)[name = string("input_183")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_183)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_3328_axes_0 = const()[name = string("op_3328_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3328 = squeeze(axes = var_3328_axes_0, x = mlp_out_15)[name = string("op_3328")];
+            tensor<int32, [3]> var_3332 = const()[name = string("op_3332"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3338 = const()[name = string("op_3338"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_123 = transpose(perm = var_3332, x = var_3328)[name = string("transpose_13")];
+            tensor<fp16, [1, 1, 2560]> var_3340 = mul(x = x_123, y = const_61_promoted)[name = string("op_3340")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_185 = concat(axis = var_3338, interleave = input_185_interleave_0, values = (x_123, var_3340))[name = string("input_185")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3335_to_fp16 = const()[name = string("op_3335_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_3335_to_fp16, x = input_185)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_3345_split_sizes_0 = const()[name = string("op_3345_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3345_axis_0 = const()[name = string("op_3345_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3345_0, tensor<fp16, [1, 1, 2560]> var_3345_1 = split(axis = var_3345_axis_0, split_sizes = var_3345_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_3345")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_73 = mul(x = var_3345_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_75_cast_fp16 = add(x = x_121_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 7936])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 1, 8192])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_3373 = const()[name = string("op_3373"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_187_axes_0 = const()[name = string("input_187_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3374 = transpose(perm = var_3373, x = hidden_states_75_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [1, 2560, 1, 1]> input_187 = expand_dims(axes = input_187_axes_0, x = var_3374)[name = string("input_187")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_187)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_3393 = const()[name = string("op_3393"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3394_cast_fp16 = transpose(perm = var_3393, x = per_layer_slice_15_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_3394_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_189_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_189_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409429952))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409757696))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_189_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_3410_axes_0 = const()[name = string("op_3410_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3410_cast_fp16 = squeeze(axes = var_3410_axes_0, x = gated_47_cast_fp16)[name = string("op_3410_cast_fp16")];
+            tensor<int32, [3]> var_3414 = const()[name = string("op_3414"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3420 = const()[name = string("op_3420"), val = int32(-1)];
+            fp16 const_62_promoted_to_fp16 = const()[name = string("const_62_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_125_cast_fp16 = transpose(perm = var_3414, x = var_3410_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [1, 1, 2560]> var_3422_cast_fp16 = mul(x = x_125_cast_fp16, y = const_62_promoted_to_fp16)[name = string("op_3422_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_191_cast_fp16 = concat(axis = var_3420, interleave = input_191_interleave_0, values = (x_125_cast_fp16, var_3422_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3417_to_fp16 = const()[name = string("op_3417_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_3417_to_fp16, x = input_191_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_3427_split_sizes_0 = const()[name = string("op_3427_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3427_axis_0 = const()[name = string("op_3427_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3427_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3427_cast_fp16_1 = split(axis = var_3427_axis_0, split_sizes = var_3427_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_3427_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409760320)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_79_cast_fp16 = mul(x = var_3427_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a2p-1])];
+            tensor<fp16, [1, 1, 2560]> x_127_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_63_promoted_to_fp16)[name = string("x_127_cast_fp16")];
+            int32 var_3442 = const()[name = string("op_3442"), val = int32(-1)];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3444_cast_fp16 = mul(x = x_127_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_3444_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_193_cast_fp16 = concat(axis = var_3442, interleave = input_193_interleave_0, values = (x_127_cast_fp16, var_3444_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3439_to_fp16 = const()[name = string("op_3439_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_3439_to_fp16, x = input_193_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_3449_split_sizes_0 = const()[name = string("op_3449_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3449_axis_0 = const()[name = string("op_3449_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3449_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3449_cast_fp16_1 = split(axis = var_3449_axis_0, split_sizes = var_3449_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_3449_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409765504)))];
+            tensor<fp16, [1, 1, 2560]> h_49_cast_fp16 = mul(x = var_3449_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_3455 = const()[name = string("op_3455"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3458_axes_0 = const()[name = string("op_3458_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3456_cast_fp16 = transpose(perm = var_3455, x = h_49_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3458_cast_fp16 = expand_dims(axes = var_3458_axes_0, x = var_3456_cast_fp16)[name = string("op_3458_cast_fp16")];
+            string var_3474_pad_type_0 = const()[name = string("op_3474_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3474_strides_0 = const()[name = string("op_3474_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3474_pad_0 = const()[name = string("op_3474_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3474_dilations_0 = const()[name = string("op_3474_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3474_groups_0 = const()[name = string("op_3474_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3474 = conv(dilations = var_3474_dilations_0, groups = var_3474_groups_0, pad = var_3474_pad_0, pad_type = var_3474_pad_type_0, strides = var_3474_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_3458_cast_fp16)[name = string("op_3474")];
+            tensor<int32, [4]> var_3479 = const()[name = string("op_3479"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3480 = reshape(shape = var_3479, x = var_3474)[name = string("op_3480")];
+            tensor<int32, [4]> var_3485 = const()[name = string("op_3485"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3495 = const()[name = string("op_3495"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3486 = transpose(perm = var_3485, x = var_3480)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 256]> x_129 = reshape(shape = var_3495, x = var_3486)[name = string("x_129")];
+            int32 var_3501 = const()[name = string("op_3501"), val = int32(-1)];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3503 = mul(x = x_129, y = const_65_promoted)[name = string("op_3503")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_197 = concat(axis = var_3501, interleave = input_197_interleave_0, values = (x_129, var_3503))[name = string("input_197")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3498_to_fp16 = const()[name = string("op_3498_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_3498_to_fp16, x = input_197)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_3508_split_sizes_0 = const()[name = string("op_3508_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3508_axis_0 = const()[name = string("op_3508_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3508_0, tensor<fp16, [1, 8, 256]> var_3508_1 = split(axis = var_3508_axis_0, split_sizes = var_3508_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_3508")];
+            tensor<fp16, [1, 8, 256]> var_3510 = mul(x = var_3508_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_3510")];
+            tensor<int32, [4]> var_3515 = const()[name = string("op_3515"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_51 = reshape(shape = var_3515, x = var_3510)[name = string("q_51")];
+            tensor<fp16, [1, 8, 1, 256]> var_3517_cast_fp16 = mul(x = q_51, y = cos_s)[name = string("op_3517_cast_fp16")];
+            tensor<int32, [2]> var_3518_split_sizes_0 = const()[name = string("op_3518_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3518_axis_0 = const()[name = string("op_3518_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3518_0, tensor<fp16, [1, 8, 1, 128]> var_3518_1 = split(axis = var_3518_axis_0, split_sizes = var_3518_split_sizes_0, x = q_51)[name = string("op_3518")];
+            fp16 const_66_promoted = const()[name = string("const_66_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3520 = mul(x = var_3518_1, y = const_66_promoted)[name = string("op_3520")];
+            int32 var_3522 = const()[name = string("op_3522"), val = int32(-1)];
+            bool var_3523_interleave_0 = const()[name = string("op_3523_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3523 = concat(axis = var_3522, interleave = var_3523_interleave_0, values = (var_3520, var_3518_0))[name = string("op_3523")];
+            tensor<fp16, [1, 8, 1, 256]> var_3524_cast_fp16 = mul(x = var_3523, y = sin_s)[name = string("op_3524_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_cast_fp16 = add(x = var_3517_cast_fp16, y = var_3524_cast_fp16)[name = string("q_cast_fp16")];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_131_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_131_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_131_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 1, 512]> var_3556 = sub(x = x_131_cast_fp16, y = reduce_max_8)[name = string("op_3556")];
+            tensor<fp16, [1, 8, 1, 512]> var_3562 = exp(x = var_3556)[name = string("op_3562")];
+            tensor<int32, [1]> var_3572_axes_0 = const()[name = string("op_3572_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3572_keep_dims_0 = const()[name = string("op_3572_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3572 = reduce_sum(axes = var_3572_axes_0, keep_dims = var_3572_keep_dims_0, x = var_3562)[name = string("op_3572")];
+            tensor<fp16, [1, 8, 1, 512]> var_3578_cast_fp16 = real_div(x = var_3562, y = var_3572)[name = string("op_3578_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_3578_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_3589 = const()[name = string("op_3589"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3596 = const()[name = string("op_3596"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3590_cast_fp16 = transpose(perm = var_3589, x = attn_output_49_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_3596, x = var_3590_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_3601 = const()[name = string("op_3601"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3617_pad_type_0 = const()[name = string("op_3617_pad_type_0"), val = string("valid")];
+            int32 var_3617_groups_0 = const()[name = string("op_3617_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3617_strides_0 = const()[name = string("op_3617_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3617_pad_0 = const()[name = string("op_3617_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3617_dilations_0 = const()[name = string("op_3617_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409770688))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412392192))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3602_cast_fp16 = transpose(perm = var_3601, x = attn_output_51_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1]> var_3617_cast_fp16 = conv(dilations = var_3617_dilations_0, groups = var_3617_groups_0, pad = var_3617_pad_0, pad_type = var_3617_pad_type_0, strides = var_3617_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_3602_cast_fp16)[name = string("op_3617_cast_fp16")];
+            tensor<int32, [3]> var_3621 = const()[name = string("op_3621"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3627 = const()[name = string("op_3627"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_135_cast_fp16 = transpose(perm = var_3621, x = var_3617_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 1, 2560]> var_3629_cast_fp16 = mul(x = x_135_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_3629_cast_fp16")];
+            bool input_201_interleave_0 = const()[name = string("input_201_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_201_cast_fp16 = concat(axis = var_3627, interleave = input_201_interleave_0, values = (x_135_cast_fp16, var_3629_cast_fp16))[name = string("input_201_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3624_to_fp16 = const()[name = string("op_3624_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_3624_to_fp16, x = input_201_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_3634_split_sizes_0 = const()[name = string("op_3634_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3634_axis_0 = const()[name = string("op_3634_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3634_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3634_cast_fp16_1 = split(axis = var_3634_axis_0, split_sizes = var_3634_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_3634_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412394816)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_cast_fp16 = mul(x = var_3634_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_137_cast_fp16 = add(x = x_127_cast_fp16, y = attn_output_cast_fp16)[name = string("x_137_cast_fp16")];
+            int32 var_3643 = const()[name = string("op_3643"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3645_cast_fp16 = mul(x = x_137_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_3645_cast_fp16")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_203_cast_fp16 = concat(axis = var_3643, interleave = input_203_interleave_0, values = (x_137_cast_fp16, var_3645_cast_fp16))[name = string("input_203_cast_fp16")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3640_to_fp16 = const()[name = string("op_3640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_3640_to_fp16, x = input_203_cast_fp16)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_3650_split_sizes_0 = const()[name = string("op_3650_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3650_axis_0 = const()[name = string("op_3650_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3650_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3650_cast_fp16_1 = split(axis = var_3650_axis_0, split_sizes = var_3650_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_3650_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412400000)))];
+            tensor<fp16, [1, 1, 2560]> h_51_cast_fp16 = mul(x = var_3650_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_3661 = const()[name = string("op_3661"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3662 = transpose(perm = var_3661, x = h_51_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 1]> input_205 = expand_dims(axes = input_205_axes_0, x = var_3662)[name = string("input_205")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_205)[name = string("gate_33")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_205)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate = gelu(mode = gate_mode_0, x = gate_33)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 1]> input_207 = mul(x = gate, y = up)[name = string("input_207")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_207)[name = string("mlp_out")];
+            tensor<int32, [1]> var_3702_axes_0 = const()[name = string("op_3702_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3702 = squeeze(axes = var_3702_axes_0, x = mlp_out)[name = string("op_3702")];
+            tensor<int32, [3]> var_3706 = const()[name = string("op_3706"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3712 = const()[name = string("op_3712"), val = int32(-1)];
+            fp16 const_69_promoted = const()[name = string("const_69_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_139 = transpose(perm = var_3706, x = var_3702)[name = string("transpose_3")];
+            tensor<fp16, [1, 1, 2560]> var_3714 = mul(x = x_139, y = const_69_promoted)[name = string("op_3714")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_209 = concat(axis = var_3712, interleave = input_209_interleave_0, values = (x_139, var_3714))[name = string("input_209")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3709_to_fp16 = const()[name = string("op_3709_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_3709_to_fp16, x = input_209)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_3719_split_sizes_0 = const()[name = string("op_3719_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3719_axis_0 = const()[name = string("op_3719_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3719_0, tensor<fp16, [1, 1, 2560]> var_3719_1 = split(axis = var_3719_axis_0, split_sizes = var_3719_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_3719")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_83 = mul(x = var_3719_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_85_cast_fp16 = add(x = x_137_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 8192])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 1, 8448])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_3747 = const()[name = string("op_3747"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_211_axes_0 = const()[name = string("input_211_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3748 = transpose(perm = var_3747, x = hidden_states_85_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 1]> input_211 = expand_dims(axes = input_211_axes_0, x = var_3748)[name = string("input_211")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_211)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_3767 = const()[name = string("op_3767"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3768_cast_fp16 = transpose(perm = var_3767, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_3768_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_213_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_cast_fp16)[name = string("input_213_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412405184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412732928))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_213_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_3784_axes_0 = const()[name = string("op_3784_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3784_cast_fp16 = squeeze(axes = var_3784_axes_0, x = gated_cast_fp16)[name = string("op_3784_cast_fp16")];
+            tensor<int32, [3]> var_3788 = const()[name = string("op_3788"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3794 = const()[name = string("op_3794"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_cast_fp16 = transpose(perm = var_3788, x = var_3784_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 1, 2560]> var_3796_cast_fp16 = mul(x = x_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_3796_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_cast_fp16 = concat(axis = var_3794, interleave = input_interleave_0, values = (x_cast_fp16, var_3796_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3791_to_fp16 = const()[name = string("op_3791_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_3791_to_fp16, x = input_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_3801_split_sizes_0 = const()[name = string("op_3801_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3801_axis_0 = const()[name = string("op_3801_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3801_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3801_cast_fp16_1 = split(axis = var_3801_axis_0, split_sizes = var_3801_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_3801_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412735552)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_89_cast_fp16 = mul(x = var_3801_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b4p-1])];
+            tensor<fp16, [1, 1, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_71_promoted_to_fp16)[name = string("op_3811_cast_fp16")];
+            tensor<fp16, [1, 1, 2048, 1]> update_mask_tmp = identity(x = update_mask)[name = string("update_mask_tmp")];
+        } -> (hidden_states_out);
+    func verify_qK<ios18>(tensor<fp16, [1, 1, 3, 2048]> causal_mask_full, tensor<fp16, [1, 1, 3, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 3, 512]> cos_f, tensor<fp16, [1, 1, 3, 256]> cos_s, tensor<fp16, [1, 3, 2560]> hidden_states, tensor<fp16, [1, 2, 512, 256]> kv13_k, tensor<fp16, [1, 2, 512, 256]> kv13_v, tensor<fp16, [1, 2, 2048, 512]> kv14_k, tensor<fp16, [1, 2, 2048, 512]> kv14_v, tensor<fp16, [1, 3, 10752]> per_layer_combined, tensor<fp16, [1, 1, 3, 512]> sin_f, tensor<fp16, [1, 1, 3, 256]> sin_s) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15731520))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15741824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28849088))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28859392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41966656))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41969280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41974464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302208))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44924032))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44926144))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58033408))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58043712))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71150976))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71161280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84268544))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84271168)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84276352))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604096))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604416))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87225920))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(87228032))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100335296))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(100345600))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113452864))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(113463168))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126570432))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126573056)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126578240))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126905984))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(126906304))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129527808))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129529920))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(142637184))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(142647488))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(155754752))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(155765056))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168872320))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168874944)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168880128))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(169207872))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(169208192))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171829696))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171831808))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(184939072))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(184949376))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(198056640))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(198066944))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211174208))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211176832)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211182016))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211509760))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(211510080))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216753024))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_5_self_attn_q_norm_weight = const()[name = string("layers_5_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216757184)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216758272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229865536))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229875840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242983104))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242993408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256100672))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256103296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256108480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436224))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259058048))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259060160))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272167424))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272177728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285284992))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285295296))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298402560))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298405184)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298410368))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738112))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738432))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301359936))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301362048))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314469312))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314479616))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327586880))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327597184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340704448))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340707072)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340712256))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040000))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040320))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(343661824))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(343663936))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356771200))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(356781504))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369888768))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(369899072))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383006336))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383008960)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383014144))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383341888))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            int32 var_449 = const()[name = string("op_449"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_451_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_451_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_1_cast_fp16 = concat(axis = var_449, interleave = input_1_interleave_0, values = (hidden_states, var_451_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_446_to_fp16 = const()[name = string("op_446_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_446_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_456_split_sizes_0 = const()[name = string("op_456_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_456_axis_0 = const()[name = string("op_456_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_456_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_456_cast_fp16_1 = split(axis = var_456_axis_0, split_sizes = var_456_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_456_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383342208)))];
+            tensor<fp16, [1, 3, 2560]> h_1_cast_fp16 = mul(x = var_456_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_462 = const()[name = string("op_462"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_465_axes_0 = const()[name = string("op_465_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_463_cast_fp16 = transpose(perm = var_462, x = h_1_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [1, 2560, 1, 3]> var_465_cast_fp16 = expand_dims(axes = var_465_axes_0, x = var_463_cast_fp16)[name = string("op_465_cast_fp16")];
+            string q_1_pad_type_0 = const()[name = string("q_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_1_strides_0 = const()[name = string("q_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = string("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_1_dilations_0 = const()[name = string("q_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_1_groups_0 = const()[name = string("q_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_1 = conv(dilations = q_1_dilations_0, groups = q_1_groups_0, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = q_1_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_465_cast_fp16)[name = string("q_1")];
+            tensor<int32, [4]> var_486 = const()[name = string("op_486"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_487 = reshape(shape = var_486, x = q_1)[name = string("op_487")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_510 = const()[name = string("op_510"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_36 = transpose(perm = transpose_36_perm_0, x = var_487)[name = string("transpose_109")];
+            tensor<fp16, [3, 8, 256]> x_1 = reshape(shape = var_510, x = transpose_36)[name = string("x_1")];
+            int32 var_516 = const()[name = string("op_516"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_518 = mul(x = x_1, y = const_1_promoted)[name = string("op_518")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_5 = concat(axis = var_516, interleave = input_5_interleave_0, values = (x_1, var_518))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_513_to_fp16 = const()[name = string("op_513_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_513_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_523_split_sizes_0 = const()[name = string("op_523_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_523_axis_0 = const()[name = string("op_523_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_523_0, tensor<fp16, [3, 8, 256]> var_523_1 = split(axis = var_523_axis_0, split_sizes = var_523_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_523")];
+            tensor<fp16, [3, 8, 256]> q_5 = mul(x = var_523_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_5")];
+            tensor<int32, [4]> var_530 = const()[name = string("op_530"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_531 = reshape(shape = var_530, x = q_5)[name = string("op_531")];
+            tensor<int32, [4]> var_536 = const()[name = string("op_536"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_7 = transpose(perm = var_536, x = var_531)[name = string("transpose_108")];
+            tensor<fp16, [1, 8, 3, 256]> var_538_cast_fp16 = mul(x = q_7, y = cos_s)[name = string("op_538_cast_fp16")];
+            tensor<int32, [2]> var_539_split_sizes_0 = const()[name = string("op_539_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_539_axis_0 = const()[name = string("op_539_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_539_0, tensor<fp16, [1, 8, 3, 128]> var_539_1 = split(axis = var_539_axis_0, split_sizes = var_539_split_sizes_0, x = q_7)[name = string("op_539")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_541 = mul(x = var_539_1, y = const_2_promoted)[name = string("op_541")];
+            int32 var_543 = const()[name = string("op_543"), val = int32(-1)];
+            bool var_544_interleave_0 = const()[name = string("op_544_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_544 = concat(axis = var_543, interleave = var_544_interleave_0, values = (var_541, var_539_0))[name = string("op_544")];
+            tensor<fp16, [1, 8, 3, 256]> var_545_cast_fp16 = mul(x = var_544, y = sin_s)[name = string("op_545_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_9_cast_fp16 = add(x = var_538_cast_fp16, y = var_545_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = kv13_k)[name = string("transpose_107")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_106")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = kv13_v)[name = string("transpose_105")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_104")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_9_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_3_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_3_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 3, 512]> var_577 = sub(x = x_3_cast_fp16, y = reduce_max_0)[name = string("op_577")];
+            tensor<fp16, [1, 8, 3, 512]> var_583 = exp(x = var_577)[name = string("op_583")];
+            tensor<int32, [1]> var_593_axes_0 = const()[name = string("op_593_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_593_keep_dims_0 = const()[name = string("op_593_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_593 = reduce_sum(axes = var_593_axes_0, keep_dims = var_593_keep_dims_0, x = var_583)[name = string("op_593")];
+            tensor<fp16, [1, 8, 3, 512]> var_599_cast_fp16 = real_div(x = var_583, y = var_593)[name = string("op_599_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_599_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_610 = const()[name = string("op_610"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_617 = const()[name = string("op_617"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_611_cast_fp16 = transpose(perm = var_610, x = attn_output_1_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [1, 3, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_617, x = var_611_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_622 = const()[name = string("op_622"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_638_pad_type_0 = const()[name = string("op_638_pad_type_0"), val = string("valid")];
+            int32 var_638_groups_0 = const()[name = string("op_638_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_638_strides_0 = const()[name = string("op_638_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_638_pad_0 = const()[name = string("op_638_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_638_dilations_0 = const()[name = string("op_638_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(383347392))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385968896))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_623_cast_fp16 = transpose(perm = var_622, x = attn_output_3_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [1, 2560, 3]> var_638_cast_fp16 = conv(dilations = var_638_dilations_0, groups = var_638_groups_0, pad = var_638_pad_0, pad_type = var_638_pad_type_0, strides = var_638_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_623_cast_fp16)[name = string("op_638_cast_fp16")];
+            tensor<int32, [3]> var_642 = const()[name = string("op_642"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_648 = const()[name = string("op_648"), val = int32(-1)];
+            fp16 const_3_promoted_to_fp16 = const()[name = string("const_3_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_7_cast_fp16 = transpose(perm = var_642, x = var_638_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 3, 2560]> var_650_cast_fp16 = mul(x = x_7_cast_fp16, y = const_3_promoted_to_fp16)[name = string("op_650_cast_fp16")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_9_cast_fp16 = concat(axis = var_648, interleave = input_9_interleave_0, values = (x_7_cast_fp16, var_650_cast_fp16))[name = string("input_9_cast_fp16")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_645_to_fp16 = const()[name = string("op_645_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_645_to_fp16, x = input_9_cast_fp16)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_655_split_sizes_0 = const()[name = string("op_655_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_655_axis_0 = const()[name = string("op_655_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_655_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_655_cast_fp16_1 = split(axis = var_655_axis_0, split_sizes = var_655_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_655_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385971520)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_5_cast_fp16 = mul(x = var_655_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_9_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            int32 var_664 = const()[name = string("op_664"), val = int32(-1)];
+            fp16 const_4_promoted_to_fp16 = const()[name = string("const_4_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_666_cast_fp16 = mul(x = x_9_cast_fp16, y = const_4_promoted_to_fp16)[name = string("op_666_cast_fp16")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_11_cast_fp16 = concat(axis = var_664, interleave = input_11_interleave_0, values = (x_9_cast_fp16, var_666_cast_fp16))[name = string("input_11_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_661_to_fp16 = const()[name = string("op_661_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_661_to_fp16, x = input_11_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_671_split_sizes_0 = const()[name = string("op_671_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_671_axis_0 = const()[name = string("op_671_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_671_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_671_cast_fp16_1 = split(axis = var_671_axis_0, split_sizes = var_671_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_671_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385976704)))];
+            tensor<fp16, [1, 3, 2560]> h_3_cast_fp16 = mul(x = var_671_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_682 = const()[name = string("op_682"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_13_axes_0 = const()[name = string("input_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_683 = transpose(perm = var_682, x = h_3_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [1, 2560, 1, 3]> input_13 = expand_dims(axes = input_13_axes_0, x = var_683)[name = string("input_13")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_13)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_13)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 3]> input_15 = mul(x = gate_3, y = up_1)[name = string("input_15")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_15)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_723_axes_0 = const()[name = string("op_723_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_723 = squeeze(axes = var_723_axes_0, x = mlp_out_1)[name = string("op_723")];
+            tensor<int32, [3]> var_727 = const()[name = string("op_727"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_733 = const()[name = string("op_733"), val = int32(-1)];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_11 = transpose(perm = var_727, x = var_723)[name = string("transpose_97")];
+            tensor<fp16, [1, 3, 2560]> var_735 = mul(x = x_11, y = const_5_promoted)[name = string("op_735")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_17 = concat(axis = var_733, interleave = input_17_interleave_0, values = (x_11, var_735))[name = string("input_17")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_730_to_fp16 = const()[name = string("op_730_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_730_to_fp16, x = input_17)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_740_split_sizes_0 = const()[name = string("op_740_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_740_axis_0 = const()[name = string("op_740_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_740_0, tensor<fp16, [1, 3, 2560]> var_740_1 = split(axis = var_740_axis_0, split_sizes = var_740_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_740")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_3 = mul(x = var_740_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_5_cast_fp16 = add(x = x_9_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 6144])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 3, 6400])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_768 = const()[name = string("op_768"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_769 = transpose(perm = var_768, x = hidden_states_5_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 2560, 1, 3]> input_19 = expand_dims(axes = input_19_axes_0, x = var_769)[name = string("input_19")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_19)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_788 = const()[name = string("op_788"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_789_cast_fp16 = transpose(perm = var_788, x = per_layer_slice_1_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_789_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_21_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_21_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385981888))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386309632))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_21_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_805_axes_0 = const()[name = string("op_805_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_805_cast_fp16 = squeeze(axes = var_805_axes_0, x = gated_5_cast_fp16)[name = string("op_805_cast_fp16")];
+            tensor<int32, [3]> var_809 = const()[name = string("op_809"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_815 = const()[name = string("op_815"), val = int32(-1)];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_13_cast_fp16 = transpose(perm = var_809, x = var_805_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 3, 2560]> var_817_cast_fp16 = mul(x = x_13_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_817_cast_fp16")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_23_cast_fp16 = concat(axis = var_815, interleave = input_23_interleave_0, values = (x_13_cast_fp16, var_817_cast_fp16))[name = string("input_23_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_812_to_fp16 = const()[name = string("op_812_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_812_to_fp16, x = input_23_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_822_split_sizes_0 = const()[name = string("op_822_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_822_axis_0 = const()[name = string("op_822_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_822_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_822_cast_fp16_1 = split(axis = var_822_axis_0, split_sizes = var_822_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_822_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386312256)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_9_cast_fp16 = mul(x = var_822_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.02p-1])];
+            tensor<fp16, [1, 3, 2560]> x_15_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_837 = const()[name = string("op_837"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_839_cast_fp16 = mul(x = x_15_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_839_cast_fp16")];
+            bool input_25_interleave_0 = const()[name = string("input_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_25_cast_fp16 = concat(axis = var_837, interleave = input_25_interleave_0, values = (x_15_cast_fp16, var_839_cast_fp16))[name = string("input_25_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_834_to_fp16 = const()[name = string("op_834_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_834_to_fp16, x = input_25_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_844_split_sizes_0 = const()[name = string("op_844_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_844_axis_0 = const()[name = string("op_844_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_844_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_844_cast_fp16_1 = split(axis = var_844_axis_0, split_sizes = var_844_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_844_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386317440)))];
+            tensor<fp16, [1, 3, 2560]> h_7_cast_fp16 = mul(x = var_844_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_850 = const()[name = string("op_850"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_853_axes_0 = const()[name = string("op_853_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_851_cast_fp16 = transpose(perm = var_850, x = h_7_cast_fp16)[name = string("transpose_93")];
+            tensor<fp16, [1, 2560, 1, 3]> var_853_cast_fp16 = expand_dims(axes = var_853_axes_0, x = var_851_cast_fp16)[name = string("op_853_cast_fp16")];
+            string q_11_pad_type_0 = const()[name = string("q_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_11_strides_0 = const()[name = string("q_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_11_pad_0 = const()[name = string("q_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_11_dilations_0 = const()[name = string("q_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_11_groups_0 = const()[name = string("q_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_11 = conv(dilations = q_11_dilations_0, groups = q_11_groups_0, pad = q_11_pad_0, pad_type = q_11_pad_type_0, strides = q_11_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_853_cast_fp16)[name = string("q_11")];
+            tensor<int32, [4]> var_874 = const()[name = string("op_874"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_875 = reshape(shape = var_874, x = q_11)[name = string("op_875")];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_898 = const()[name = string("op_898"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_38 = transpose(perm = transpose_38_perm_0, x = var_875)[name = string("transpose_92")];
+            tensor<fp16, [3, 8, 256]> x_17 = reshape(shape = var_898, x = transpose_38)[name = string("x_17")];
+            int32 var_904 = const()[name = string("op_904"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_906 = mul(x = x_17, y = const_9_promoted)[name = string("op_906")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_29 = concat(axis = var_904, interleave = input_29_interleave_0, values = (x_17, var_906))[name = string("input_29")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_901_to_fp16 = const()[name = string("op_901_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_901_to_fp16, x = input_29)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_911_split_sizes_0 = const()[name = string("op_911_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_911_axis_0 = const()[name = string("op_911_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_911_0, tensor<fp16, [3, 8, 256]> var_911_1 = split(axis = var_911_axis_0, split_sizes = var_911_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_911")];
+            tensor<fp16, [3, 8, 256]> q_15 = mul(x = var_911_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_15")];
+            tensor<int32, [4]> var_918 = const()[name = string("op_918"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_919 = reshape(shape = var_918, x = q_15)[name = string("op_919")];
+            tensor<int32, [4]> var_924 = const()[name = string("op_924"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_17 = transpose(perm = var_924, x = var_919)[name = string("transpose_91")];
+            tensor<fp16, [1, 8, 3, 256]> var_926_cast_fp16 = mul(x = q_17, y = cos_s)[name = string("op_926_cast_fp16")];
+            tensor<int32, [2]> var_927_split_sizes_0 = const()[name = string("op_927_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_927_axis_0 = const()[name = string("op_927_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_927_0, tensor<fp16, [1, 8, 3, 128]> var_927_1 = split(axis = var_927_axis_0, split_sizes = var_927_split_sizes_0, x = q_17)[name = string("op_927")];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_929 = mul(x = var_927_1, y = const_10_promoted)[name = string("op_929")];
+            int32 var_931 = const()[name = string("op_931"), val = int32(-1)];
+            bool var_932_interleave_0 = const()[name = string("op_932_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_932 = concat(axis = var_931, interleave = var_932_interleave_0, values = (var_929, var_927_0))[name = string("op_932")];
+            tensor<fp16, [1, 8, 3, 256]> var_933_cast_fp16 = mul(x = var_932, y = sin_s)[name = string("op_933_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_19_cast_fp16 = add(x = var_926_cast_fp16, y = var_933_cast_fp16)[name = string("q_19_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_19_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_19_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_19_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 3, 512]> var_965 = sub(x = x_19_cast_fp16, y = reduce_max_1)[name = string("op_965")];
+            tensor<fp16, [1, 8, 3, 512]> var_971 = exp(x = var_965)[name = string("op_971")];
+            tensor<int32, [1]> var_981_axes_0 = const()[name = string("op_981_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_981_keep_dims_0 = const()[name = string("op_981_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_981 = reduce_sum(axes = var_981_axes_0, keep_dims = var_981_keep_dims_0, x = var_971)[name = string("op_981")];
+            tensor<fp16, [1, 8, 3, 512]> var_987_cast_fp16 = real_div(x = var_971, y = var_981)[name = string("op_987_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_987_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_998 = const()[name = string("op_998"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1005 = const()[name = string("op_1005"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_999_cast_fp16 = transpose(perm = var_998, x = attn_output_7_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 3, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_1005, x = var_999_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_1010 = const()[name = string("op_1010"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1026_pad_type_0 = const()[name = string("op_1026_pad_type_0"), val = string("valid")];
+            int32 var_1026_groups_0 = const()[name = string("op_1026_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1026_strides_0 = const()[name = string("op_1026_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1026_pad_0 = const()[name = string("op_1026_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1026_dilations_0 = const()[name = string("op_1026_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(386322624))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388944128))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1011_cast_fp16 = transpose(perm = var_1010, x = attn_output_9_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 2560, 3]> var_1026_cast_fp16 = conv(dilations = var_1026_dilations_0, groups = var_1026_groups_0, pad = var_1026_pad_0, pad_type = var_1026_pad_type_0, strides = var_1026_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_1011_cast_fp16)[name = string("op_1026_cast_fp16")];
+            tensor<int32, [3]> var_1030 = const()[name = string("op_1030"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1036 = const()[name = string("op_1036"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_23_cast_fp16 = transpose(perm = var_1030, x = var_1026_cast_fp16)[name = string("transpose_88")];
+            tensor<fp16, [1, 3, 2560]> var_1038_cast_fp16 = mul(x = x_23_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1038_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_33_cast_fp16 = concat(axis = var_1036, interleave = input_33_interleave_0, values = (x_23_cast_fp16, var_1038_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1033_to_fp16 = const()[name = string("op_1033_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1033_to_fp16, x = input_33_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1043_split_sizes_0 = const()[name = string("op_1043_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1043_axis_0 = const()[name = string("op_1043_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1043_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1043_cast_fp16_1 = split(axis = var_1043_axis_0, split_sizes = var_1043_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1043_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388946752)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_11_cast_fp16 = mul(x = var_1043_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_25_cast_fp16 = add(x = x_15_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_25_cast_fp16")];
+            int32 var_1052 = const()[name = string("op_1052"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1054_cast_fp16 = mul(x = x_25_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1054_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_35_cast_fp16 = concat(axis = var_1052, interleave = input_35_interleave_0, values = (x_25_cast_fp16, var_1054_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1049_to_fp16 = const()[name = string("op_1049_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1049_to_fp16, x = input_35_cast_fp16)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1059_split_sizes_0 = const()[name = string("op_1059_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1059_axis_0 = const()[name = string("op_1059_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1059_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1059_cast_fp16_1 = split(axis = var_1059_axis_0, split_sizes = var_1059_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1059_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388951936)))];
+            tensor<fp16, [1, 3, 2560]> h_9_cast_fp16 = mul(x = var_1059_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1070 = const()[name = string("op_1070"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_37_axes_0 = const()[name = string("input_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1071 = transpose(perm = var_1070, x = h_9_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 2560, 1, 3]> input_37 = expand_dims(axes = input_37_axes_0, x = var_1071)[name = string("input_37")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_37)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_37)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 3]> input_39 = mul(x = gate_7, y = up_3)[name = string("input_39")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_39)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1111_axes_0 = const()[name = string("op_1111_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1111 = squeeze(axes = var_1111_axes_0, x = mlp_out_3)[name = string("op_1111")];
+            tensor<int32, [3]> var_1115 = const()[name = string("op_1115"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1121 = const()[name = string("op_1121"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_27 = transpose(perm = var_1115, x = var_1111)[name = string("transpose_86")];
+            tensor<fp16, [1, 3, 2560]> var_1123 = mul(x = x_27, y = const_13_promoted)[name = string("op_1123")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_41 = concat(axis = var_1121, interleave = input_41_interleave_0, values = (x_27, var_1123))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1118_to_fp16 = const()[name = string("op_1118_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1118_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1128_split_sizes_0 = const()[name = string("op_1128_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1128_axis_0 = const()[name = string("op_1128_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1128_0, tensor<fp16, [1, 3, 2560]> var_1128_1 = split(axis = var_1128_axis_0, split_sizes = var_1128_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1128")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_13 = mul(x = var_1128_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_15_cast_fp16 = add(x = x_25_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 6400])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 3, 6656])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1156 = const()[name = string("op_1156"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_43_axes_0 = const()[name = string("input_43_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1157 = transpose(perm = var_1156, x = hidden_states_15_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 2560, 1, 3]> input_43 = expand_dims(axes = input_43_axes_0, x = var_1157)[name = string("input_43")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_43)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1176 = const()[name = string("op_1176"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1177_cast_fp16 = transpose(perm = var_1176, x = per_layer_slice_3_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1177_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_45_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388957120))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389284864))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_45_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1193_axes_0 = const()[name = string("op_1193_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1193_cast_fp16 = squeeze(axes = var_1193_axes_0, x = gated_11_cast_fp16)[name = string("op_1193_cast_fp16")];
+            tensor<int32, [3]> var_1197 = const()[name = string("op_1197"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1203 = const()[name = string("op_1203"), val = int32(-1)];
+            fp16 const_14_promoted_to_fp16 = const()[name = string("const_14_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_29_cast_fp16 = transpose(perm = var_1197, x = var_1193_cast_fp16)[name = string("transpose_83")];
+            tensor<fp16, [1, 3, 2560]> var_1205_cast_fp16 = mul(x = x_29_cast_fp16, y = const_14_promoted_to_fp16)[name = string("op_1205_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_47_cast_fp16 = concat(axis = var_1203, interleave = input_47_interleave_0, values = (x_29_cast_fp16, var_1205_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1200_to_fp16 = const()[name = string("op_1200_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1200_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1210_split_sizes_0 = const()[name = string("op_1210_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1210_axis_0 = const()[name = string("op_1210_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1210_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1210_cast_fp16_1 = split(axis = var_1210_axis_0, split_sizes = var_1210_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1210_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389287488)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1210_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_15_promoted_to_fp16 = const()[name = string("const_15_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6ep-1])];
+            tensor<fp16, [1, 3, 2560]> x_31_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_15_promoted_to_fp16)[name = string("x_31_cast_fp16")];
+            int32 var_1225 = const()[name = string("op_1225"), val = int32(-1)];
+            fp16 const_16_promoted_to_fp16 = const()[name = string("const_16_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1227_cast_fp16 = mul(x = x_31_cast_fp16, y = const_16_promoted_to_fp16)[name = string("op_1227_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_49_cast_fp16 = concat(axis = var_1225, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1227_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1222_to_fp16 = const()[name = string("op_1222_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1222_to_fp16, x = input_49_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1232_split_sizes_0 = const()[name = string("op_1232_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1232_axis_0 = const()[name = string("op_1232_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1232_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1232_cast_fp16_1 = split(axis = var_1232_axis_0, split_sizes = var_1232_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1232_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389292672)))];
+            tensor<fp16, [1, 3, 2560]> h_13_cast_fp16 = mul(x = var_1232_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1238 = const()[name = string("op_1238"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1241_axes_0 = const()[name = string("op_1241_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1239_cast_fp16 = transpose(perm = var_1238, x = h_13_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1241_cast_fp16 = expand_dims(axes = var_1241_axes_0, x = var_1239_cast_fp16)[name = string("op_1241_cast_fp16")];
+            string q_21_pad_type_0 = const()[name = string("q_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_21_strides_0 = const()[name = string("q_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_21_pad_0 = const()[name = string("q_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_21_dilations_0 = const()[name = string("q_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_21_groups_0 = const()[name = string("q_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_21 = conv(dilations = q_21_dilations_0, groups = q_21_groups_0, pad = q_21_pad_0, pad_type = q_21_pad_type_0, strides = q_21_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1241_cast_fp16)[name = string("q_21")];
+            tensor<int32, [4]> var_1262 = const()[name = string("op_1262"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_1263 = reshape(shape = var_1262, x = q_21)[name = string("op_1263")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_1286 = const()[name = string("op_1286"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_40 = transpose(perm = transpose_40_perm_0, x = var_1263)[name = string("transpose_81")];
+            tensor<fp16, [3, 8, 256]> x_33 = reshape(shape = var_1286, x = transpose_40)[name = string("x_33")];
+            int32 var_1292 = const()[name = string("op_1292"), val = int32(-1)];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_1294 = mul(x = x_33, y = const_17_promoted)[name = string("op_1294")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_53 = concat(axis = var_1292, interleave = input_53_interleave_0, values = (x_33, var_1294))[name = string("input_53")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1289_to_fp16 = const()[name = string("op_1289_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1289_to_fp16, x = input_53)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1299_split_sizes_0 = const()[name = string("op_1299_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1299_axis_0 = const()[name = string("op_1299_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_1299_0, tensor<fp16, [3, 8, 256]> var_1299_1 = split(axis = var_1299_axis_0, split_sizes = var_1299_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1299")];
+            tensor<fp16, [3, 8, 256]> q_25 = mul(x = var_1299_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_25")];
+            tensor<int32, [4]> var_1306 = const()[name = string("op_1306"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_1307 = reshape(shape = var_1306, x = q_25)[name = string("op_1307")];
+            tensor<int32, [4]> var_1312 = const()[name = string("op_1312"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_27 = transpose(perm = var_1312, x = var_1307)[name = string("transpose_80")];
+            tensor<fp16, [1, 8, 3, 256]> var_1314_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_1314_cast_fp16")];
+            tensor<int32, [2]> var_1315_split_sizes_0 = const()[name = string("op_1315_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1315_axis_0 = const()[name = string("op_1315_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_1315_0, tensor<fp16, [1, 8, 3, 128]> var_1315_1 = split(axis = var_1315_axis_0, split_sizes = var_1315_split_sizes_0, x = q_27)[name = string("op_1315")];
+            fp16 const_18_promoted = const()[name = string("const_18_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_1317 = mul(x = var_1315_1, y = const_18_promoted)[name = string("op_1317")];
+            int32 var_1319 = const()[name = string("op_1319"), val = int32(-1)];
+            bool var_1320_interleave_0 = const()[name = string("op_1320_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_1320 = concat(axis = var_1319, interleave = var_1320_interleave_0, values = (var_1317, var_1315_0))[name = string("op_1320")];
+            tensor<fp16, [1, 8, 3, 256]> var_1321_cast_fp16 = mul(x = var_1320, y = sin_s)[name = string("op_1321_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_29_cast_fp16 = add(x = var_1314_cast_fp16, y = var_1321_cast_fp16)[name = string("q_29_cast_fp16")];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_29_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_35_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_sliding)[name = string("x_35_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_35_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 3, 512]> var_1353 = sub(x = x_35_cast_fp16, y = reduce_max_2)[name = string("op_1353")];
+            tensor<fp16, [1, 8, 3, 512]> var_1359 = exp(x = var_1353)[name = string("op_1359")];
+            tensor<int32, [1]> var_1369_axes_0 = const()[name = string("op_1369_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1369_keep_dims_0 = const()[name = string("op_1369_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1369 = reduce_sum(axes = var_1369_axes_0, keep_dims = var_1369_keep_dims_0, x = var_1359)[name = string("op_1369")];
+            tensor<fp16, [1, 8, 3, 512]> var_1375_cast_fp16 = real_div(x = var_1359, y = var_1369)[name = string("op_1375_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_1375_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1386 = const()[name = string("op_1386"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1393 = const()[name = string("op_1393"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1387_cast_fp16 = transpose(perm = var_1386, x = attn_output_13_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 3, 2048]> attn_output_15_cast_fp16 = reshape(shape = var_1393, x = var_1387_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_1398 = const()[name = string("op_1398"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1414_pad_type_0 = const()[name = string("op_1414_pad_type_0"), val = string("valid")];
+            int32 var_1414_groups_0 = const()[name = string("op_1414_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1414_strides_0 = const()[name = string("op_1414_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1414_pad_0 = const()[name = string("op_1414_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1414_dilations_0 = const()[name = string("op_1414_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(389297856))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391919360))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1399_cast_fp16 = transpose(perm = var_1398, x = attn_output_15_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [1, 2560, 3]> var_1414_cast_fp16 = conv(dilations = var_1414_dilations_0, groups = var_1414_groups_0, pad = var_1414_pad_0, pad_type = var_1414_pad_type_0, strides = var_1414_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_1399_cast_fp16)[name = string("op_1414_cast_fp16")];
+            tensor<int32, [3]> var_1418 = const()[name = string("op_1418"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1424 = const()[name = string("op_1424"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_39_cast_fp16 = transpose(perm = var_1418, x = var_1414_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 3, 2560]> var_1426_cast_fp16 = mul(x = x_39_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1426_cast_fp16")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_57_cast_fp16 = concat(axis = var_1424, interleave = input_57_interleave_0, values = (x_39_cast_fp16, var_1426_cast_fp16))[name = string("input_57_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1421_to_fp16 = const()[name = string("op_1421_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1421_to_fp16, x = input_57_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1431_split_sizes_0 = const()[name = string("op_1431_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1431_axis_0 = const()[name = string("op_1431_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1431_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1431_cast_fp16_1 = split(axis = var_1431_axis_0, split_sizes = var_1431_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1431_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391921984)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_17_cast_fp16 = mul(x = var_1431_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_41_cast_fp16 = add(x = x_31_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_41_cast_fp16")];
+            int32 var_1440 = const()[name = string("op_1440"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1442_cast_fp16 = mul(x = x_41_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1442_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_59_cast_fp16 = concat(axis = var_1440, interleave = input_59_interleave_0, values = (x_41_cast_fp16, var_1442_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1437_to_fp16 = const()[name = string("op_1437_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1437_to_fp16, x = input_59_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1447_split_sizes_0 = const()[name = string("op_1447_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1447_axis_0 = const()[name = string("op_1447_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1447_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1447_cast_fp16_1 = split(axis = var_1447_axis_0, split_sizes = var_1447_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1447_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391927168)))];
+            tensor<fp16, [1, 3, 2560]> h_15_cast_fp16 = mul(x = var_1447_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_1458 = const()[name = string("op_1458"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_61_axes_0 = const()[name = string("input_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1459 = transpose(perm = var_1458, x = h_15_cast_fp16)[name = string("transpose_76")];
+            tensor<fp16, [1, 2560, 1, 3]> input_61 = expand_dims(axes = input_61_axes_0, x = var_1459)[name = string("input_61")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_61)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_61)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 3]> input_63 = mul(x = gate_11, y = up_5)[name = string("input_63")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_63)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_1499_axes_0 = const()[name = string("op_1499_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1499 = squeeze(axes = var_1499_axes_0, x = mlp_out_5)[name = string("op_1499")];
+            tensor<int32, [3]> var_1503 = const()[name = string("op_1503"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1509 = const()[name = string("op_1509"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_43 = transpose(perm = var_1503, x = var_1499)[name = string("transpose_75")];
+            tensor<fp16, [1, 3, 2560]> var_1511 = mul(x = x_43, y = const_21_promoted)[name = string("op_1511")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_65 = concat(axis = var_1509, interleave = input_65_interleave_0, values = (x_43, var_1511))[name = string("input_65")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1506_to_fp16 = const()[name = string("op_1506_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_1506_to_fp16, x = input_65)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_1516_split_sizes_0 = const()[name = string("op_1516_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1516_axis_0 = const()[name = string("op_1516_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1516_0, tensor<fp16, [1, 3, 2560]> var_1516_1 = split(axis = var_1516_axis_0, split_sizes = var_1516_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_1516")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_23 = mul(x = var_1516_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_25_cast_fp16 = add(x = x_41_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 6656])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 3, 6912])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_1544 = const()[name = string("op_1544"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_67_axes_0 = const()[name = string("input_67_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1545 = transpose(perm = var_1544, x = hidden_states_25_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [1, 2560, 1, 3]> input_67 = expand_dims(axes = input_67_axes_0, x = var_1545)[name = string("input_67")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_67)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_1564 = const()[name = string("op_1564"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1565_cast_fp16 = transpose(perm = var_1564, x = per_layer_slice_5_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_1565_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_69_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_69_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391932352))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392260096))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_69_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_1581_axes_0 = const()[name = string("op_1581_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1581_cast_fp16 = squeeze(axes = var_1581_axes_0, x = gated_17_cast_fp16)[name = string("op_1581_cast_fp16")];
+            tensor<int32, [3]> var_1585 = const()[name = string("op_1585"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1591 = const()[name = string("op_1591"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_45_cast_fp16 = transpose(perm = var_1585, x = var_1581_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 3, 2560]> var_1593_cast_fp16 = mul(x = x_45_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1593_cast_fp16")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_71_cast_fp16 = concat(axis = var_1591, interleave = input_71_interleave_0, values = (x_45_cast_fp16, var_1593_cast_fp16))[name = string("input_71_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1588_to_fp16 = const()[name = string("op_1588_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_1588_to_fp16, x = input_71_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_1598_split_sizes_0 = const()[name = string("op_1598_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1598_axis_0 = const()[name = string("op_1598_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1598_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1598_cast_fp16_1 = split(axis = var_1598_axis_0, split_sizes = var_1598_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_1598_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392262720)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_29_cast_fp16 = mul(x = var_1598_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6ep-1])];
+            tensor<fp16, [1, 3, 2560]> x_47_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_47_cast_fp16")];
+            int32 var_1613 = const()[name = string("op_1613"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1615_cast_fp16 = mul(x = x_47_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1615_cast_fp16")];
+            bool input_73_interleave_0 = const()[name = string("input_73_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_73_cast_fp16 = concat(axis = var_1613, interleave = input_73_interleave_0, values = (x_47_cast_fp16, var_1615_cast_fp16))[name = string("input_73_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1610_to_fp16 = const()[name = string("op_1610_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_1610_to_fp16, x = input_73_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_1620_split_sizes_0 = const()[name = string("op_1620_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1620_axis_0 = const()[name = string("op_1620_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1620_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1620_cast_fp16_1 = split(axis = var_1620_axis_0, split_sizes = var_1620_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_1620_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392267904)))];
+            tensor<fp16, [1, 3, 2560]> h_19_cast_fp16 = mul(x = var_1620_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_1626 = const()[name = string("op_1626"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1629_axes_0 = const()[name = string("op_1629_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1627_cast_fp16 = transpose(perm = var_1626, x = h_19_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1629_cast_fp16 = expand_dims(axes = var_1629_axes_0, x = var_1627_cast_fp16)[name = string("op_1629_cast_fp16")];
+            string q_31_pad_type_0 = const()[name = string("q_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_31_strides_0 = const()[name = string("q_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_31_pad_0 = const()[name = string("q_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_31_dilations_0 = const()[name = string("q_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_31_groups_0 = const()[name = string("q_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_31 = conv(dilations = q_31_dilations_0, groups = q_31_groups_0, pad = q_31_pad_0, pad_type = q_31_pad_type_0, strides = q_31_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_1629_cast_fp16)[name = string("q_31")];
+            tensor<int32, [4]> var_1650 = const()[name = string("op_1650"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_1651 = reshape(shape = var_1650, x = q_31)[name = string("op_1651")];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_1674 = const()[name = string("op_1674"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_42 = transpose(perm = transpose_42_perm_0, x = var_1651)[name = string("transpose_70")];
+            tensor<fp16, [3, 8, 256]> x_49 = reshape(shape = var_1674, x = transpose_42)[name = string("x_49")];
+            int32 var_1680 = const()[name = string("op_1680"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_1682 = mul(x = x_49, y = const_25_promoted)[name = string("op_1682")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_77 = concat(axis = var_1680, interleave = input_77_interleave_0, values = (x_49, var_1682))[name = string("input_77")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1677_to_fp16 = const()[name = string("op_1677_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_1677_to_fp16, x = input_77)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_1687_split_sizes_0 = const()[name = string("op_1687_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1687_axis_0 = const()[name = string("op_1687_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_1687_0, tensor<fp16, [3, 8, 256]> var_1687_1 = split(axis = var_1687_axis_0, split_sizes = var_1687_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_1687")];
+            tensor<fp16, [3, 8, 256]> q_35 = mul(x = var_1687_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_35")];
+            tensor<int32, [4]> var_1694 = const()[name = string("op_1694"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_1695 = reshape(shape = var_1694, x = q_35)[name = string("op_1695")];
+            tensor<int32, [4]> var_1700 = const()[name = string("op_1700"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_37 = transpose(perm = var_1700, x = var_1695)[name = string("transpose_69")];
+            tensor<fp16, [1, 8, 3, 256]> var_1702_cast_fp16 = mul(x = q_37, y = cos_s)[name = string("op_1702_cast_fp16")];
+            tensor<int32, [2]> var_1703_split_sizes_0 = const()[name = string("op_1703_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1703_axis_0 = const()[name = string("op_1703_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_1703_0, tensor<fp16, [1, 8, 3, 128]> var_1703_1 = split(axis = var_1703_axis_0, split_sizes = var_1703_split_sizes_0, x = q_37)[name = string("op_1703")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_1705 = mul(x = var_1703_1, y = const_26_promoted)[name = string("op_1705")];
+            int32 var_1707 = const()[name = string("op_1707"), val = int32(-1)];
+            bool var_1708_interleave_0 = const()[name = string("op_1708_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_1708 = concat(axis = var_1707, interleave = var_1708_interleave_0, values = (var_1705, var_1703_0))[name = string("op_1708")];
+            tensor<fp16, [1, 8, 3, 256]> var_1709_cast_fp16 = mul(x = var_1708, y = sin_s)[name = string("op_1709_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_39_cast_fp16 = add(x = var_1702_cast_fp16, y = var_1709_cast_fp16)[name = string("q_39_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_39_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_51_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_51_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_51_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 3, 512]> var_1741 = sub(x = x_51_cast_fp16, y = reduce_max_3)[name = string("op_1741")];
+            tensor<fp16, [1, 8, 3, 512]> var_1747 = exp(x = var_1741)[name = string("op_1747")];
+            tensor<int32, [1]> var_1757_axes_0 = const()[name = string("op_1757_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1757_keep_dims_0 = const()[name = string("op_1757_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1757 = reduce_sum(axes = var_1757_axes_0, keep_dims = var_1757_keep_dims_0, x = var_1747)[name = string("op_1757")];
+            tensor<fp16, [1, 8, 3, 512]> var_1763_cast_fp16 = real_div(x = var_1747, y = var_1757)[name = string("op_1763_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_1763_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_1774 = const()[name = string("op_1774"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1781 = const()[name = string("op_1781"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1775_cast_fp16 = transpose(perm = var_1774, x = attn_output_19_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 3, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_1781, x = var_1775_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_1786 = const()[name = string("op_1786"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1802_pad_type_0 = const()[name = string("op_1802_pad_type_0"), val = string("valid")];
+            int32 var_1802_groups_0 = const()[name = string("op_1802_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1802_strides_0 = const()[name = string("op_1802_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1802_pad_0 = const()[name = string("op_1802_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1802_dilations_0 = const()[name = string("op_1802_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(392273088))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394894592))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1787_cast_fp16 = transpose(perm = var_1786, x = attn_output_21_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 2560, 3]> var_1802_cast_fp16 = conv(dilations = var_1802_dilations_0, groups = var_1802_groups_0, pad = var_1802_pad_0, pad_type = var_1802_pad_type_0, strides = var_1802_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_1787_cast_fp16)[name = string("op_1802_cast_fp16")];
+            tensor<int32, [3]> var_1806 = const()[name = string("op_1806"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1812 = const()[name = string("op_1812"), val = int32(-1)];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_55_cast_fp16 = transpose(perm = var_1806, x = var_1802_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 3, 2560]> var_1814_cast_fp16 = mul(x = x_55_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_1814_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_81_cast_fp16 = concat(axis = var_1812, interleave = input_81_interleave_0, values = (x_55_cast_fp16, var_1814_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1809_to_fp16 = const()[name = string("op_1809_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_1809_to_fp16, x = input_81_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_1819_split_sizes_0 = const()[name = string("op_1819_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1819_axis_0 = const()[name = string("op_1819_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1819_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1819_cast_fp16_1 = split(axis = var_1819_axis_0, split_sizes = var_1819_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_1819_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394897216)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_23_cast_fp16 = mul(x = var_1819_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_57_cast_fp16 = add(x = x_47_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_57_cast_fp16")];
+            int32 var_1828 = const()[name = string("op_1828"), val = int32(-1)];
+            fp16 const_28_promoted_to_fp16 = const()[name = string("const_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1830_cast_fp16 = mul(x = x_57_cast_fp16, y = const_28_promoted_to_fp16)[name = string("op_1830_cast_fp16")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_83_cast_fp16 = concat(axis = var_1828, interleave = input_83_interleave_0, values = (x_57_cast_fp16, var_1830_cast_fp16))[name = string("input_83_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1825_to_fp16 = const()[name = string("op_1825_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_1825_to_fp16, x = input_83_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_1835_split_sizes_0 = const()[name = string("op_1835_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1835_axis_0 = const()[name = string("op_1835_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1835_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1835_cast_fp16_1 = split(axis = var_1835_axis_0, split_sizes = var_1835_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_1835_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394902400)))];
+            tensor<fp16, [1, 3, 2560]> h_21_cast_fp16 = mul(x = var_1835_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_1846 = const()[name = string("op_1846"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1847 = transpose(perm = var_1846, x = h_21_cast_fp16)[name = string("transpose_65")];
+            tensor<fp16, [1, 2560, 1, 3]> input_85 = expand_dims(axes = input_85_axes_0, x = var_1847)[name = string("input_85")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_85)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_85)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 3]> input_87 = mul(x = gate_15, y = up_7)[name = string("input_87")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_87)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_1887_axes_0 = const()[name = string("op_1887_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1887 = squeeze(axes = var_1887_axes_0, x = mlp_out_7)[name = string("op_1887")];
+            tensor<int32, [3]> var_1891 = const()[name = string("op_1891"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1897 = const()[name = string("op_1897"), val = int32(-1)];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_59 = transpose(perm = var_1891, x = var_1887)[name = string("transpose_64")];
+            tensor<fp16, [1, 3, 2560]> var_1899 = mul(x = x_59, y = const_29_promoted)[name = string("op_1899")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_89 = concat(axis = var_1897, interleave = input_89_interleave_0, values = (x_59, var_1899))[name = string("input_89")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1894_to_fp16 = const()[name = string("op_1894_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_1894_to_fp16, x = input_89)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_1904_split_sizes_0 = const()[name = string("op_1904_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1904_axis_0 = const()[name = string("op_1904_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1904_0, tensor<fp16, [1, 3, 2560]> var_1904_1 = split(axis = var_1904_axis_0, split_sizes = var_1904_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_1904")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_33 = mul(x = var_1904_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_35_cast_fp16 = add(x = x_57_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 6912])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 3, 7168])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_1932 = const()[name = string("op_1932"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_91_axes_0 = const()[name = string("input_91_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1933 = transpose(perm = var_1932, x = hidden_states_35_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 2560, 1, 3]> input_91 = expand_dims(axes = input_91_axes_0, x = var_1933)[name = string("input_91")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_91)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_1952 = const()[name = string("op_1952"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1953_cast_fp16 = transpose(perm = var_1952, x = per_layer_slice_7_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_1953_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_93_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_93_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(394907584))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395235328))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_93_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_1969_axes_0 = const()[name = string("op_1969_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1969_cast_fp16 = squeeze(axes = var_1969_axes_0, x = gated_23_cast_fp16)[name = string("op_1969_cast_fp16")];
+            tensor<int32, [3]> var_1973 = const()[name = string("op_1973"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1979 = const()[name = string("op_1979"), val = int32(-1)];
+            fp16 const_30_promoted_to_fp16 = const()[name = string("const_30_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_61_cast_fp16 = transpose(perm = var_1973, x = var_1969_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 3, 2560]> var_1981_cast_fp16 = mul(x = x_61_cast_fp16, y = const_30_promoted_to_fp16)[name = string("op_1981_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_95_cast_fp16 = concat(axis = var_1979, interleave = input_95_interleave_0, values = (x_61_cast_fp16, var_1981_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1976_to_fp16 = const()[name = string("op_1976_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_1976_to_fp16, x = input_95_cast_fp16)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_1986_split_sizes_0 = const()[name = string("op_1986_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1986_axis_0 = const()[name = string("op_1986_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1986_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1986_cast_fp16_1 = split(axis = var_1986_axis_0, split_sizes = var_1986_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_1986_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395237952)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_39_cast_fp16 = mul(x = var_1986_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.62p-1])];
+            tensor<fp16, [1, 3, 2560]> x_63_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_31_promoted_to_fp16)[name = string("x_63_cast_fp16")];
+            int32 var_2001 = const()[name = string("op_2001"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2003_cast_fp16 = mul(x = x_63_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2003_cast_fp16")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_97_cast_fp16 = concat(axis = var_2001, interleave = input_97_interleave_0, values = (x_63_cast_fp16, var_2003_cast_fp16))[name = string("input_97_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1998_to_fp16 = const()[name = string("op_1998_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_1998_to_fp16, x = input_97_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_2008_split_sizes_0 = const()[name = string("op_2008_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2008_axis_0 = const()[name = string("op_2008_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2008_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2008_cast_fp16_1 = split(axis = var_2008_axis_0, split_sizes = var_2008_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_2008_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395243136)))];
+            tensor<fp16, [1, 3, 2560]> h_25_cast_fp16 = mul(x = var_2008_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_2014 = const()[name = string("op_2014"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2017_axes_0 = const()[name = string("op_2017_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2015_cast_fp16 = transpose(perm = var_2014, x = h_25_cast_fp16)[name = string("transpose_60")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2017_cast_fp16 = expand_dims(axes = var_2017_axes_0, x = var_2015_cast_fp16)[name = string("op_2017_cast_fp16")];
+            string q_41_pad_type_0 = const()[name = string("q_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_41_strides_0 = const()[name = string("q_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_41_pad_0 = const()[name = string("q_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_41_dilations_0 = const()[name = string("q_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_41_groups_0 = const()[name = string("q_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_41 = conv(dilations = q_41_dilations_0, groups = q_41_groups_0, pad = q_41_pad_0, pad_type = q_41_pad_type_0, strides = q_41_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_2017_cast_fp16)[name = string("q_41")];
+            tensor<int32, [4]> var_2038 = const()[name = string("op_2038"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2039 = reshape(shape = var_2038, x = q_41)[name = string("op_2039")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2062 = const()[name = string("op_2062"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_44 = transpose(perm = transpose_44_perm_0, x = var_2039)[name = string("transpose_59")];
+            tensor<fp16, [3, 8, 256]> x_65 = reshape(shape = var_2062, x = transpose_44)[name = string("x_65")];
+            int32 var_2068 = const()[name = string("op_2068"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2070 = mul(x = x_65, y = const_33_promoted)[name = string("op_2070")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_101 = concat(axis = var_2068, interleave = input_101_interleave_0, values = (x_65, var_2070))[name = string("input_101")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2065_to_fp16 = const()[name = string("op_2065_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2065_to_fp16, x = input_101)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2075_split_sizes_0 = const()[name = string("op_2075_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2075_axis_0 = const()[name = string("op_2075_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2075_0, tensor<fp16, [3, 8, 256]> var_2075_1 = split(axis = var_2075_axis_0, split_sizes = var_2075_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2075")];
+            tensor<fp16, [3, 8, 256]> q_45 = mul(x = var_2075_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_45")];
+            tensor<int32, [4]> var_2082 = const()[name = string("op_2082"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2083 = reshape(shape = var_2082, x = q_45)[name = string("op_2083")];
+            tensor<int32, [4]> var_2088 = const()[name = string("op_2088"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_47 = transpose(perm = var_2088, x = var_2083)[name = string("transpose_58")];
+            tensor<fp16, [1, 8, 3, 256]> var_2090_cast_fp16 = mul(x = q_47, y = cos_s)[name = string("op_2090_cast_fp16")];
+            tensor<int32, [2]> var_2091_split_sizes_0 = const()[name = string("op_2091_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2091_axis_0 = const()[name = string("op_2091_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2091_0, tensor<fp16, [1, 8, 3, 128]> var_2091_1 = split(axis = var_2091_axis_0, split_sizes = var_2091_split_sizes_0, x = q_47)[name = string("op_2091")];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2093 = mul(x = var_2091_1, y = const_34_promoted)[name = string("op_2093")];
+            int32 var_2095 = const()[name = string("op_2095"), val = int32(-1)];
+            bool var_2096_interleave_0 = const()[name = string("op_2096_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2096 = concat(axis = var_2095, interleave = var_2096_interleave_0, values = (var_2093, var_2091_0))[name = string("op_2096")];
+            tensor<fp16, [1, 8, 3, 256]> var_2097_cast_fp16 = mul(x = var_2096, y = sin_s)[name = string("op_2097_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_49_cast_fp16 = add(x = var_2090_cast_fp16, y = var_2097_cast_fp16)[name = string("q_49_cast_fp16")];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_49_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_67_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 3, 512]> var_2129 = sub(x = x_67_cast_fp16, y = reduce_max_4)[name = string("op_2129")];
+            tensor<fp16, [1, 8, 3, 512]> var_2135 = exp(x = var_2129)[name = string("op_2135")];
+            tensor<int32, [1]> var_2145_axes_0 = const()[name = string("op_2145_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2145_keep_dims_0 = const()[name = string("op_2145_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2145 = reduce_sum(axes = var_2145_axes_0, keep_dims = var_2145_keep_dims_0, x = var_2135)[name = string("op_2145")];
+            tensor<fp16, [1, 8, 3, 512]> var_2151_cast_fp16 = real_div(x = var_2135, y = var_2145)[name = string("op_2151_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_2151_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_2162 = const()[name = string("op_2162"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2169 = const()[name = string("op_2169"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2163_cast_fp16 = transpose(perm = var_2162, x = attn_output_25_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 3, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_2169, x = var_2163_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_2174 = const()[name = string("op_2174"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2190_pad_type_0 = const()[name = string("op_2190_pad_type_0"), val = string("valid")];
+            int32 var_2190_groups_0 = const()[name = string("op_2190_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2190_strides_0 = const()[name = string("op_2190_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2190_pad_0 = const()[name = string("op_2190_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2190_dilations_0 = const()[name = string("op_2190_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(395248320))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397869824))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2175_cast_fp16 = transpose(perm = var_2174, x = attn_output_27_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 3]> var_2190_cast_fp16 = conv(dilations = var_2190_dilations_0, groups = var_2190_groups_0, pad = var_2190_pad_0, pad_type = var_2190_pad_type_0, strides = var_2190_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_2175_cast_fp16)[name = string("op_2190_cast_fp16")];
+            tensor<int32, [3]> var_2194 = const()[name = string("op_2194"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2200 = const()[name = string("op_2200"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_71_cast_fp16 = transpose(perm = var_2194, x = var_2190_cast_fp16)[name = string("transpose_55")];
+            tensor<fp16, [1, 3, 2560]> var_2202_cast_fp16 = mul(x = x_71_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2202_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_105_cast_fp16 = concat(axis = var_2200, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_2202_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2197_to_fp16 = const()[name = string("op_2197_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2197_to_fp16, x = input_105_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2207_split_sizes_0 = const()[name = string("op_2207_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2207_axis_0 = const()[name = string("op_2207_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2207_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2207_cast_fp16_1 = split(axis = var_2207_axis_0, split_sizes = var_2207_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2207_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397872448)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_29_cast_fp16 = mul(x = var_2207_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_73_cast_fp16 = add(x = x_63_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2216 = const()[name = string("op_2216"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2218_cast_fp16 = mul(x = x_73_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2218_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_107_cast_fp16 = concat(axis = var_2216, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_2218_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2213_to_fp16 = const()[name = string("op_2213_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2213_to_fp16, x = input_107_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2223_split_sizes_0 = const()[name = string("op_2223_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2223_axis_0 = const()[name = string("op_2223_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2223_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2223_cast_fp16_1 = split(axis = var_2223_axis_0, split_sizes = var_2223_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2223_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397877632)))];
+            tensor<fp16, [1, 3, 2560]> h_27_cast_fp16 = mul(x = var_2223_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_2234 = const()[name = string("op_2234"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2235 = transpose(perm = var_2234, x = h_27_cast_fp16)[name = string("transpose_54")];
+            tensor<fp16, [1, 2560, 1, 3]> input_109 = expand_dims(axes = input_109_axes_0, x = var_2235)[name = string("input_109")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 3]> input_111 = mul(x = gate_19, y = up_9)[name = string("input_111")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_2275_axes_0 = const()[name = string("op_2275_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2275 = squeeze(axes = var_2275_axes_0, x = mlp_out_9)[name = string("op_2275")];
+            tensor<int32, [3]> var_2279 = const()[name = string("op_2279"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2285 = const()[name = string("op_2285"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_75 = transpose(perm = var_2279, x = var_2275)[name = string("transpose_53")];
+            tensor<fp16, [1, 3, 2560]> var_2287 = mul(x = x_75, y = const_37_promoted)[name = string("op_2287")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_113 = concat(axis = var_2285, interleave = input_113_interleave_0, values = (x_75, var_2287))[name = string("input_113")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2282_to_fp16 = const()[name = string("op_2282_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_2282_to_fp16, x = input_113)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_2292_split_sizes_0 = const()[name = string("op_2292_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2292_axis_0 = const()[name = string("op_2292_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2292_0, tensor<fp16, [1, 3, 2560]> var_2292_1 = split(axis = var_2292_axis_0, split_sizes = var_2292_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_2292")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_43 = mul(x = var_2292_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_45_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 7168])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 3, 7424])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_2320 = const()[name = string("op_2320"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2321 = transpose(perm = var_2320, x = hidden_states_45_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 2560, 1, 3]> input_115 = expand_dims(axes = input_115_axes_0, x = var_2321)[name = string("input_115")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_2340 = const()[name = string("op_2340"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_2341_cast_fp16 = transpose(perm = var_2340, x = per_layer_slice_9_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_2341_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_117_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397882816))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398210560))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_2357_axes_0 = const()[name = string("op_2357_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2357_cast_fp16 = squeeze(axes = var_2357_axes_0, x = gated_29_cast_fp16)[name = string("op_2357_cast_fp16")];
+            tensor<int32, [3]> var_2361 = const()[name = string("op_2361"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2367 = const()[name = string("op_2367"), val = int32(-1)];
+            fp16 const_38_promoted_to_fp16 = const()[name = string("const_38_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_77_cast_fp16 = transpose(perm = var_2361, x = var_2357_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [1, 3, 2560]> var_2369_cast_fp16 = mul(x = x_77_cast_fp16, y = const_38_promoted_to_fp16)[name = string("op_2369_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_119_cast_fp16 = concat(axis = var_2367, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_2369_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2364_to_fp16 = const()[name = string("op_2364_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_2364_to_fp16, x = input_119_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_2374_split_sizes_0 = const()[name = string("op_2374_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2374_axis_0 = const()[name = string("op_2374_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2374_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2374_cast_fp16_1 = split(axis = var_2374_axis_0, split_sizes = var_2374_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_2374_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398213184)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_49_cast_fp16 = mul(x = var_2374_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_39_promoted_to_fp16 = const()[name = string("const_39_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.3ap-1])];
+            tensor<fp16, [1, 3, 2560]> x_79_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_39_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_2389 = const()[name = string("op_2389"), val = int32(-1)];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2391_cast_fp16 = mul(x = x_79_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_2391_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_121_cast_fp16 = concat(axis = var_2389, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_2391_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2386_to_fp16 = const()[name = string("op_2386_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_2386_to_fp16, x = input_121_cast_fp16)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_2396_split_sizes_0 = const()[name = string("op_2396_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2396_axis_0 = const()[name = string("op_2396_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2396_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2396_cast_fp16_1 = split(axis = var_2396_axis_0, split_sizes = var_2396_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_2396_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398218368)))];
+            tensor<fp16, [1, 3, 2560]> h_31_cast_fp16 = mul(x = var_2396_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_2402 = const()[name = string("op_2402"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2405_axes_0 = const()[name = string("op_2405_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2403_cast_fp16 = transpose(perm = var_2402, x = h_31_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2405_cast_fp16 = expand_dims(axes = var_2405_axes_0, x = var_2403_cast_fp16)[name = string("op_2405_cast_fp16")];
+            string q_51_pad_type_0 = const()[name = string("q_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_51_strides_0 = const()[name = string("q_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_51_pad_0 = const()[name = string("q_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_51_dilations_0 = const()[name = string("q_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_51_groups_0 = const()[name = string("q_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_51 = conv(dilations = q_51_dilations_0, groups = q_51_groups_0, pad = q_51_pad_0, pad_type = q_51_pad_type_0, strides = q_51_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_2405_cast_fp16)[name = string("q_51")];
+            tensor<int32, [4]> var_2426 = const()[name = string("op_2426"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_2427 = reshape(shape = var_2426, x = q_51)[name = string("op_2427")];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2450 = const()[name = string("op_2450"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_46 = transpose(perm = transpose_46_perm_0, x = var_2427)[name = string("transpose_48")];
+            tensor<fp16, [3, 8, 512]> x_81 = reshape(shape = var_2450, x = transpose_46)[name = string("x_81")];
+            int32 var_2456 = const()[name = string("op_2456"), val = int32(-1)];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_2458 = mul(x = x_81, y = const_41_promoted)[name = string("op_2458")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_125 = concat(axis = var_2456, interleave = input_125_interleave_0, values = (x_81, var_2458))[name = string("input_125")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2453_to_fp16 = const()[name = string("op_2453_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_2453_to_fp16, x = input_125)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_2463_split_sizes_0 = const()[name = string("op_2463_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_2463_axis_0 = const()[name = string("op_2463_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_2463_0, tensor<fp16, [3, 8, 512]> var_2463_1 = split(axis = var_2463_axis_0, split_sizes = var_2463_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_2463")];
+            tensor<fp16, [3, 8, 512]> q_55 = mul(x = var_2463_0, y = layers_5_self_attn_q_norm_weight)[name = string("q_55")];
+            tensor<int32, [4]> var_2470 = const()[name = string("op_2470"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_2471 = reshape(shape = var_2470, x = q_55)[name = string("op_2471")];
+            tensor<int32, [4]> var_2476 = const()[name = string("op_2476"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_57 = transpose(perm = var_2476, x = var_2471)[name = string("transpose_47")];
+            tensor<fp16, [1, 8, 3, 512]> var_2478_cast_fp16 = mul(x = q_57, y = cos_f)[name = string("op_2478_cast_fp16")];
+            tensor<int32, [2]> var_2479_split_sizes_0 = const()[name = string("op_2479_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2479_axis_0 = const()[name = string("op_2479_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_2479_0, tensor<fp16, [1, 8, 3, 256]> var_2479_1 = split(axis = var_2479_axis_0, split_sizes = var_2479_split_sizes_0, x = q_57)[name = string("op_2479")];
+            fp16 const_42_promoted = const()[name = string("const_42_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_2481 = mul(x = var_2479_1, y = const_42_promoted)[name = string("op_2481")];
+            int32 var_2483 = const()[name = string("op_2483"), val = int32(-1)];
+            bool var_2484_interleave_0 = const()[name = string("op_2484_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_2484 = concat(axis = var_2483, interleave = var_2484_interleave_0, values = (var_2481, var_2479_0))[name = string("op_2484")];
+            tensor<fp16, [1, 8, 3, 512]> var_2485_cast_fp16 = mul(x = var_2484, y = sin_f)[name = string("op_2485_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_59_cast_fp16 = add(x = var_2478_cast_fp16, y = var_2485_cast_fp16)[name = string("q_59_cast_fp16")];
+            tensor<int32, [4]> transpose_20_perm_0 = const()[name = string("transpose_20_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_10_reps_0 = const()[name = string("tile_10_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_20_cast_fp16 = transpose(perm = transpose_20_perm_0, x = kv14_k)[name = string("transpose_46")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_10_cast_fp16 = tile(reps = tile_10_reps_0, x = transpose_20_cast_fp16)[name = string("tile_10_cast_fp16")];
+            tensor<int32, [5]> concat_20 = const()[name = string("concat_20"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_20_cast_fp16 = reshape(shape = concat_20, x = tile_10_cast_fp16)[name = string("reshape_20_cast_fp16")];
+            tensor<int32, [5]> transpose_21_perm_0 = const()[name = string("transpose_21_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_21 = const()[name = string("concat_21"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_21_cast_fp16 = transpose(perm = transpose_21_perm_0, x = reshape_20_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_21_cast_fp16 = reshape(shape = concat_21, x = transpose_21_cast_fp16)[name = string("reshape_21_cast_fp16")];
+            tensor<int32, [4]> transpose_47_perm_0 = const()[name = string("transpose_47_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_22_perm_0 = const()[name = string("transpose_22_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_11_reps_0 = const()[name = string("tile_11_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_22_cast_fp16 = transpose(perm = transpose_22_perm_0, x = kv14_v)[name = string("transpose_44")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_11_cast_fp16 = tile(reps = tile_11_reps_0, x = transpose_22_cast_fp16)[name = string("tile_11_cast_fp16")];
+            tensor<int32, [5]> concat_22 = const()[name = string("concat_22"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_22_cast_fp16 = reshape(shape = concat_22, x = tile_11_cast_fp16)[name = string("reshape_22_cast_fp16")];
+            tensor<int32, [5]> transpose_23_perm_0 = const()[name = string("transpose_23_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_23 = const()[name = string("concat_23"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_23_cast_fp16 = transpose(perm = transpose_23_perm_0, x = reshape_22_cast_fp16)[name = string("transpose_43")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_23_cast_fp16 = reshape(shape = concat_23, x = transpose_23_cast_fp16)[name = string("reshape_23_cast_fp16")];
+            tensor<int32, [4]> V_expanded_11_perm_0 = const()[name = string("V_expanded_11_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_47_cast_fp16 = transpose(perm = transpose_47_perm_0, x = reshape_21_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_59_cast_fp16, y = transpose_47_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_83_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_full)[name = string("x_83_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_83_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 3, 2048]> var_2517 = sub(x = x_83_cast_fp16, y = reduce_max_5)[name = string("op_2517")];
+            tensor<fp16, [1, 8, 3, 2048]> var_2523 = exp(x = var_2517)[name = string("op_2523")];
+            tensor<int32, [1]> var_2533_axes_0 = const()[name = string("op_2533_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2533_keep_dims_0 = const()[name = string("op_2533_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2533 = reduce_sum(axes = var_2533_axes_0, keep_dims = var_2533_keep_dims_0, x = var_2523)[name = string("op_2533")];
+            tensor<fp16, [1, 8, 3, 2048]> var_2539_cast_fp16 = real_div(x = var_2523, y = var_2533)[name = string("op_2539_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_11_cast_fp16 = transpose(perm = V_expanded_11_perm_0, x = reshape_23_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_2539_cast_fp16, y = V_expanded_11_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_2550 = const()[name = string("op_2550"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2557 = const()[name = string("op_2557"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_2551_cast_fp16 = transpose(perm = var_2550, x = attn_output_31_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 3, 4096]> attn_output_33_cast_fp16 = reshape(shape = var_2557, x = var_2551_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_2562 = const()[name = string("op_2562"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2578_pad_type_0 = const()[name = string("op_2578_pad_type_0"), val = string("valid")];
+            int32 var_2578_groups_0 = const()[name = string("op_2578_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2578_strides_0 = const()[name = string("op_2578_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2578_pad_0 = const()[name = string("op_2578_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2578_dilations_0 = const()[name = string("op_2578_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(398223552))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403466496))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_2563_cast_fp16 = transpose(perm = var_2562, x = attn_output_33_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 2560, 3]> var_2578_cast_fp16 = conv(dilations = var_2578_dilations_0, groups = var_2578_groups_0, pad = var_2578_pad_0, pad_type = var_2578_pad_type_0, strides = var_2578_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_2563_cast_fp16)[name = string("op_2578_cast_fp16")];
+            tensor<int32, [3]> var_2582 = const()[name = string("op_2582"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2588 = const()[name = string("op_2588"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_87_cast_fp16 = transpose(perm = var_2582, x = var_2578_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 3, 2560]> var_2590_cast_fp16 = mul(x = x_87_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_2590_cast_fp16")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_129_cast_fp16 = concat(axis = var_2588, interleave = input_129_interleave_0, values = (x_87_cast_fp16, var_2590_cast_fp16))[name = string("input_129_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2585_to_fp16 = const()[name = string("op_2585_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_2585_to_fp16, x = input_129_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_2595_split_sizes_0 = const()[name = string("op_2595_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2595_axis_0 = const()[name = string("op_2595_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2595_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2595_cast_fp16_1 = split(axis = var_2595_axis_0, split_sizes = var_2595_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_2595_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403469120)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_35_cast_fp16 = mul(x = var_2595_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_89_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_89_cast_fp16")];
+            int32 var_2604 = const()[name = string("op_2604"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2606_cast_fp16 = mul(x = x_89_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2606_cast_fp16")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_131_cast_fp16 = concat(axis = var_2604, interleave = input_131_interleave_0, values = (x_89_cast_fp16, var_2606_cast_fp16))[name = string("input_131_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2601_to_fp16 = const()[name = string("op_2601_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_2601_to_fp16, x = input_131_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_2611_split_sizes_0 = const()[name = string("op_2611_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2611_axis_0 = const()[name = string("op_2611_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2611_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2611_cast_fp16_1 = split(axis = var_2611_axis_0, split_sizes = var_2611_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_2611_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403474304)))];
+            tensor<fp16, [1, 3, 2560]> h_33_cast_fp16 = mul(x = var_2611_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_2622 = const()[name = string("op_2622"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_133_axes_0 = const()[name = string("input_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2623 = transpose(perm = var_2622, x = h_33_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 2560, 1, 3]> input_133 = expand_dims(axes = input_133_axes_0, x = var_2623)[name = string("input_133")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_133)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_133)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 3]> input_135 = mul(x = gate_23, y = up_11)[name = string("input_135")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_135)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_2663_axes_0 = const()[name = string("op_2663_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2663 = squeeze(axes = var_2663_axes_0, x = mlp_out_11)[name = string("op_2663")];
+            tensor<int32, [3]> var_2667 = const()[name = string("op_2667"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2673 = const()[name = string("op_2673"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_91 = transpose(perm = var_2667, x = var_2663)[name = string("transpose_36")];
+            tensor<fp16, [1, 3, 2560]> var_2675 = mul(x = x_91, y = const_45_promoted)[name = string("op_2675")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_137 = concat(axis = var_2673, interleave = input_137_interleave_0, values = (x_91, var_2675))[name = string("input_137")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2670_to_fp16 = const()[name = string("op_2670_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_2670_to_fp16, x = input_137)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_2680_split_sizes_0 = const()[name = string("op_2680_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2680_axis_0 = const()[name = string("op_2680_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2680_0, tensor<fp16, [1, 3, 2560]> var_2680_1 = split(axis = var_2680_axis_0, split_sizes = var_2680_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_2680")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_53 = mul(x = var_2680_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_55_cast_fp16 = add(x = x_89_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 7424])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 3, 7680])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_2708 = const()[name = string("op_2708"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2709 = transpose(perm = var_2708, x = hidden_states_55_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 2560, 1, 3]> input_139 = expand_dims(axes = input_139_axes_0, x = var_2709)[name = string("input_139")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_139)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_2728 = const()[name = string("op_2728"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_2729_cast_fp16 = transpose(perm = var_2728, x = per_layer_slice_11_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_2729_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_141_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_141_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403479488))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403807232))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_141_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_2745_axes_0 = const()[name = string("op_2745_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2745_cast_fp16 = squeeze(axes = var_2745_axes_0, x = gated_35_cast_fp16)[name = string("op_2745_cast_fp16")];
+            tensor<int32, [3]> var_2749 = const()[name = string("op_2749"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2755 = const()[name = string("op_2755"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_93_cast_fp16 = transpose(perm = var_2749, x = var_2745_cast_fp16)[name = string("transpose_33")];
+            tensor<fp16, [1, 3, 2560]> var_2757_cast_fp16 = mul(x = x_93_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_2757_cast_fp16")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_143_cast_fp16 = concat(axis = var_2755, interleave = input_143_interleave_0, values = (x_93_cast_fp16, var_2757_cast_fp16))[name = string("input_143_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2752_to_fp16 = const()[name = string("op_2752_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_2752_to_fp16, x = input_143_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_2762_split_sizes_0 = const()[name = string("op_2762_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2762_axis_0 = const()[name = string("op_2762_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2762_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2762_cast_fp16_1 = split(axis = var_2762_axis_0, split_sizes = var_2762_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_2762_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403809856)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_59_cast_fp16 = mul(x = var_2762_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.aep-2])];
+            tensor<fp16, [1, 3, 2560]> x_95_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_95_cast_fp16")];
+            int32 var_2777 = const()[name = string("op_2777"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2779_cast_fp16 = mul(x = x_95_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_2779_cast_fp16")];
+            bool input_145_interleave_0 = const()[name = string("input_145_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_145_cast_fp16 = concat(axis = var_2777, interleave = input_145_interleave_0, values = (x_95_cast_fp16, var_2779_cast_fp16))[name = string("input_145_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2774_to_fp16 = const()[name = string("op_2774_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_2774_to_fp16, x = input_145_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_2784_split_sizes_0 = const()[name = string("op_2784_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2784_axis_0 = const()[name = string("op_2784_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2784_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2784_cast_fp16_1 = split(axis = var_2784_axis_0, split_sizes = var_2784_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_2784_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403815040)))];
+            tensor<fp16, [1, 3, 2560]> h_37_cast_fp16 = mul(x = var_2784_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_2790 = const()[name = string("op_2790"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2793_axes_0 = const()[name = string("op_2793_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2791_cast_fp16 = transpose(perm = var_2790, x = h_37_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2793_cast_fp16 = expand_dims(axes = var_2793_axes_0, x = var_2791_cast_fp16)[name = string("op_2793_cast_fp16")];
+            string q_61_pad_type_0 = const()[name = string("q_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_61_strides_0 = const()[name = string("q_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_61_pad_0 = const()[name = string("q_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_61_dilations_0 = const()[name = string("q_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_61_groups_0 = const()[name = string("q_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_61 = conv(dilations = q_61_dilations_0, groups = q_61_groups_0, pad = q_61_pad_0, pad_type = q_61_pad_type_0, strides = q_61_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_2793_cast_fp16)[name = string("q_61")];
+            tensor<int32, [4]> var_2814 = const()[name = string("op_2814"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2815 = reshape(shape = var_2814, x = q_61)[name = string("op_2815")];
+            tensor<int32, [4]> transpose_48_perm_0 = const()[name = string("transpose_48_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2838 = const()[name = string("op_2838"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_48 = transpose(perm = transpose_48_perm_0, x = var_2815)[name = string("transpose_31")];
+            tensor<fp16, [3, 8, 256]> x_97 = reshape(shape = var_2838, x = transpose_48)[name = string("x_97")];
+            int32 var_2844 = const()[name = string("op_2844"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2846 = mul(x = x_97, y = const_49_promoted)[name = string("op_2846")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_149 = concat(axis = var_2844, interleave = input_149_interleave_0, values = (x_97, var_2846))[name = string("input_149")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2841_to_fp16 = const()[name = string("op_2841_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_2841_to_fp16, x = input_149)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_2851_split_sizes_0 = const()[name = string("op_2851_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2851_axis_0 = const()[name = string("op_2851_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2851_0, tensor<fp16, [3, 8, 256]> var_2851_1 = split(axis = var_2851_axis_0, split_sizes = var_2851_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_2851")];
+            tensor<fp16, [3, 8, 256]> q_65 = mul(x = var_2851_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_65")];
+            tensor<int32, [4]> var_2858 = const()[name = string("op_2858"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2859 = reshape(shape = var_2858, x = q_65)[name = string("op_2859")];
+            tensor<int32, [4]> var_2864 = const()[name = string("op_2864"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_67 = transpose(perm = var_2864, x = var_2859)[name = string("transpose_30")];
+            tensor<fp16, [1, 8, 3, 256]> var_2866_cast_fp16 = mul(x = q_67, y = cos_s)[name = string("op_2866_cast_fp16")];
+            tensor<int32, [2]> var_2867_split_sizes_0 = const()[name = string("op_2867_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2867_axis_0 = const()[name = string("op_2867_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2867_0, tensor<fp16, [1, 8, 3, 128]> var_2867_1 = split(axis = var_2867_axis_0, split_sizes = var_2867_split_sizes_0, x = q_67)[name = string("op_2867")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2869 = mul(x = var_2867_1, y = const_50_promoted)[name = string("op_2869")];
+            int32 var_2871 = const()[name = string("op_2871"), val = int32(-1)];
+            bool var_2872_interleave_0 = const()[name = string("op_2872_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2872 = concat(axis = var_2871, interleave = var_2872_interleave_0, values = (var_2869, var_2867_0))[name = string("op_2872")];
+            tensor<fp16, [1, 8, 3, 256]> var_2873_cast_fp16 = mul(x = var_2872, y = sin_s)[name = string("op_2873_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_69_cast_fp16 = add(x = var_2866_cast_fp16, y = var_2873_cast_fp16)[name = string("q_69_cast_fp16")];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_69_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_99_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_99_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 3, 512]> var_2905 = sub(x = x_99_cast_fp16, y = reduce_max_6)[name = string("op_2905")];
+            tensor<fp16, [1, 8, 3, 512]> var_2911 = exp(x = var_2905)[name = string("op_2911")];
+            tensor<int32, [1]> var_2921_axes_0 = const()[name = string("op_2921_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2921_keep_dims_0 = const()[name = string("op_2921_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2921 = reduce_sum(axes = var_2921_axes_0, keep_dims = var_2921_keep_dims_0, x = var_2911)[name = string("op_2921")];
+            tensor<fp16, [1, 8, 3, 512]> var_2927_cast_fp16 = real_div(x = var_2911, y = var_2921)[name = string("op_2927_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_2927_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_2938 = const()[name = string("op_2938"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2945 = const()[name = string("op_2945"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2939_cast_fp16 = transpose(perm = var_2938, x = attn_output_37_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 3, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_2945, x = var_2939_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_2950 = const()[name = string("op_2950"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2966_pad_type_0 = const()[name = string("op_2966_pad_type_0"), val = string("valid")];
+            int32 var_2966_groups_0 = const()[name = string("op_2966_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2966_strides_0 = const()[name = string("op_2966_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2966_pad_0 = const()[name = string("op_2966_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2966_dilations_0 = const()[name = string("op_2966_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403820224))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406441728))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2951_cast_fp16 = transpose(perm = var_2950, x = attn_output_39_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 2560, 3]> var_2966_cast_fp16 = conv(dilations = var_2966_dilations_0, groups = var_2966_groups_0, pad = var_2966_pad_0, pad_type = var_2966_pad_type_0, strides = var_2966_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_2951_cast_fp16)[name = string("op_2966_cast_fp16")];
+            tensor<int32, [3]> var_2970 = const()[name = string("op_2970"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2976 = const()[name = string("op_2976"), val = int32(-1)];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_103_cast_fp16 = transpose(perm = var_2970, x = var_2966_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 3, 2560]> var_2978_cast_fp16 = mul(x = x_103_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_2978_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_153_cast_fp16 = concat(axis = var_2976, interleave = input_153_interleave_0, values = (x_103_cast_fp16, var_2978_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2973_to_fp16 = const()[name = string("op_2973_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_2973_to_fp16, x = input_153_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_2983_split_sizes_0 = const()[name = string("op_2983_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2983_axis_0 = const()[name = string("op_2983_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2983_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2983_cast_fp16_1 = split(axis = var_2983_axis_0, split_sizes = var_2983_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_2983_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406444352)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_41_cast_fp16 = mul(x = var_2983_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_105_cast_fp16 = add(x = x_95_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_105_cast_fp16")];
+            int32 var_2992 = const()[name = string("op_2992"), val = int32(-1)];
+            fp16 const_52_promoted_to_fp16 = const()[name = string("const_52_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2994_cast_fp16 = mul(x = x_105_cast_fp16, y = const_52_promoted_to_fp16)[name = string("op_2994_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_155_cast_fp16 = concat(axis = var_2992, interleave = input_155_interleave_0, values = (x_105_cast_fp16, var_2994_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2989_to_fp16 = const()[name = string("op_2989_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_2989_to_fp16, x = input_155_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_2999_split_sizes_0 = const()[name = string("op_2999_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2999_axis_0 = const()[name = string("op_2999_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2999_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2999_cast_fp16_1 = split(axis = var_2999_axis_0, split_sizes = var_2999_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_2999_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406449536)))];
+            tensor<fp16, [1, 3, 2560]> h_39_cast_fp16 = mul(x = var_2999_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_3010 = const()[name = string("op_3010"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_157_axes_0 = const()[name = string("input_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3011 = transpose(perm = var_3010, x = h_39_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 1, 3]> input_157 = expand_dims(axes = input_157_axes_0, x = var_3011)[name = string("input_157")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_157)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_157)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 3]> input_159 = mul(x = gate_27, y = up_13)[name = string("input_159")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_159)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_3051_axes_0 = const()[name = string("op_3051_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3051 = squeeze(axes = var_3051_axes_0, x = mlp_out_13)[name = string("op_3051")];
+            tensor<int32, [3]> var_3055 = const()[name = string("op_3055"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3061 = const()[name = string("op_3061"), val = int32(-1)];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_107 = transpose(perm = var_3055, x = var_3051)[name = string("transpose_25")];
+            tensor<fp16, [1, 3, 2560]> var_3063 = mul(x = x_107, y = const_53_promoted)[name = string("op_3063")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_161 = concat(axis = var_3061, interleave = input_161_interleave_0, values = (x_107, var_3063))[name = string("input_161")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3058_to_fp16 = const()[name = string("op_3058_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_3058_to_fp16, x = input_161)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_3068_split_sizes_0 = const()[name = string("op_3068_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3068_axis_0 = const()[name = string("op_3068_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3068_0, tensor<fp16, [1, 3, 2560]> var_3068_1 = split(axis = var_3068_axis_0, split_sizes = var_3068_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_3068")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_63 = mul(x = var_3068_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_65_cast_fp16 = add(x = x_105_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 7680])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 3, 7936])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_3096 = const()[name = string("op_3096"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_163_axes_0 = const()[name = string("input_163_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3097 = transpose(perm = var_3096, x = hidden_states_65_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 3]> input_163 = expand_dims(axes = input_163_axes_0, x = var_3097)[name = string("input_163")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_163)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_3116 = const()[name = string("op_3116"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3117_cast_fp16 = transpose(perm = var_3116, x = per_layer_slice_13_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_3117_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_165_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_165_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406454720))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406782464))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_165_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_3133_axes_0 = const()[name = string("op_3133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3133_cast_fp16 = squeeze(axes = var_3133_axes_0, x = gated_41_cast_fp16)[name = string("op_3133_cast_fp16")];
+            tensor<int32, [3]> var_3137 = const()[name = string("op_3137"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3143 = const()[name = string("op_3143"), val = int32(-1)];
+            fp16 const_54_promoted_to_fp16 = const()[name = string("const_54_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_109_cast_fp16 = transpose(perm = var_3137, x = var_3133_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 3, 2560]> var_3145_cast_fp16 = mul(x = x_109_cast_fp16, y = const_54_promoted_to_fp16)[name = string("op_3145_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_167_cast_fp16 = concat(axis = var_3143, interleave = input_167_interleave_0, values = (x_109_cast_fp16, var_3145_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3140_to_fp16 = const()[name = string("op_3140_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_3140_to_fp16, x = input_167_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_3150_split_sizes_0 = const()[name = string("op_3150_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3150_axis_0 = const()[name = string("op_3150_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3150_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3150_cast_fp16_1 = split(axis = var_3150_axis_0, split_sizes = var_3150_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_3150_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406785088)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_69_cast_fp16 = mul(x = var_3150_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.6cp-1])];
+            tensor<fp16, [1, 3, 2560]> x_111_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_55_promoted_to_fp16)[name = string("x_111_cast_fp16")];
+            int32 var_3165 = const()[name = string("op_3165"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3167_cast_fp16 = mul(x = x_111_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3167_cast_fp16")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_169_cast_fp16 = concat(axis = var_3165, interleave = input_169_interleave_0, values = (x_111_cast_fp16, var_3167_cast_fp16))[name = string("input_169_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3162_to_fp16 = const()[name = string("op_3162_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_3162_to_fp16, x = input_169_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_3172_split_sizes_0 = const()[name = string("op_3172_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3172_axis_0 = const()[name = string("op_3172_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3172_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3172_cast_fp16_1 = split(axis = var_3172_axis_0, split_sizes = var_3172_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_3172_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406790272)))];
+            tensor<fp16, [1, 3, 2560]> h_43_cast_fp16 = mul(x = var_3172_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_3178 = const()[name = string("op_3178"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3181_axes_0 = const()[name = string("op_3181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3179_cast_fp16 = transpose(perm = var_3178, x = h_43_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3181_cast_fp16 = expand_dims(axes = var_3181_axes_0, x = var_3179_cast_fp16)[name = string("op_3181_cast_fp16")];
+            string q_71_pad_type_0 = const()[name = string("q_71_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_71_strides_0 = const()[name = string("q_71_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_71_pad_0 = const()[name = string("q_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_71_dilations_0 = const()[name = string("q_71_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_71_groups_0 = const()[name = string("q_71_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_71 = conv(dilations = q_71_dilations_0, groups = q_71_groups_0, pad = q_71_pad_0, pad_type = q_71_pad_type_0, strides = q_71_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_3181_cast_fp16)[name = string("q_71")];
+            tensor<int32, [4]> var_3202 = const()[name = string("op_3202"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_3203 = reshape(shape = var_3202, x = q_71)[name = string("op_3203")];
+            tensor<int32, [4]> transpose_50_perm_0 = const()[name = string("transpose_50_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3226 = const()[name = string("op_3226"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_50 = transpose(perm = transpose_50_perm_0, x = var_3203)[name = string("transpose_20")];
+            tensor<fp16, [3, 8, 256]> x_113 = reshape(shape = var_3226, x = transpose_50)[name = string("x_113")];
+            int32 var_3232 = const()[name = string("op_3232"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_3234 = mul(x = x_113, y = const_57_promoted)[name = string("op_3234")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_173 = concat(axis = var_3232, interleave = input_173_interleave_0, values = (x_113, var_3234))[name = string("input_173")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3229_to_fp16 = const()[name = string("op_3229_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_3229_to_fp16, x = input_173)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_3239_split_sizes_0 = const()[name = string("op_3239_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3239_axis_0 = const()[name = string("op_3239_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_3239_0, tensor<fp16, [3, 8, 256]> var_3239_1 = split(axis = var_3239_axis_0, split_sizes = var_3239_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_3239")];
+            tensor<fp16, [3, 8, 256]> q_75 = mul(x = var_3239_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_75")];
+            tensor<int32, [4]> var_3246 = const()[name = string("op_3246"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_3247 = reshape(shape = var_3246, x = q_75)[name = string("op_3247")];
+            tensor<int32, [4]> var_3252 = const()[name = string("op_3252"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_77 = transpose(perm = var_3252, x = var_3247)[name = string("transpose_19")];
+            tensor<fp16, [1, 8, 3, 256]> var_3254_cast_fp16 = mul(x = q_77, y = cos_s)[name = string("op_3254_cast_fp16")];
+            tensor<int32, [2]> var_3255_split_sizes_0 = const()[name = string("op_3255_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3255_axis_0 = const()[name = string("op_3255_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_3255_0, tensor<fp16, [1, 8, 3, 128]> var_3255_1 = split(axis = var_3255_axis_0, split_sizes = var_3255_split_sizes_0, x = q_77)[name = string("op_3255")];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_3257 = mul(x = var_3255_1, y = const_58_promoted)[name = string("op_3257")];
+            int32 var_3259 = const()[name = string("op_3259"), val = int32(-1)];
+            bool var_3260_interleave_0 = const()[name = string("op_3260_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_3260 = concat(axis = var_3259, interleave = var_3260_interleave_0, values = (var_3257, var_3255_0))[name = string("op_3260")];
+            tensor<fp16, [1, 8, 3, 256]> var_3261_cast_fp16 = mul(x = var_3260, y = sin_s)[name = string("op_3261_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_79_cast_fp16 = add(x = var_3254_cast_fp16, y = var_3261_cast_fp16)[name = string("q_79_cast_fp16")];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_79_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_115_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_115_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_115_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 3, 512]> var_3293 = sub(x = x_115_cast_fp16, y = reduce_max_7)[name = string("op_3293")];
+            tensor<fp16, [1, 8, 3, 512]> var_3299 = exp(x = var_3293)[name = string("op_3299")];
+            tensor<int32, [1]> var_3309_axes_0 = const()[name = string("op_3309_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3309_keep_dims_0 = const()[name = string("op_3309_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3309 = reduce_sum(axes = var_3309_axes_0, keep_dims = var_3309_keep_dims_0, x = var_3299)[name = string("op_3309")];
+            tensor<fp16, [1, 8, 3, 512]> var_3315_cast_fp16 = real_div(x = var_3299, y = var_3309)[name = string("op_3315_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_3315_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_3326 = const()[name = string("op_3326"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3333 = const()[name = string("op_3333"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_3327_cast_fp16 = transpose(perm = var_3326, x = attn_output_43_cast_fp16)[name = string("transpose_18")];
+            tensor<fp16, [1, 3, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_3333, x = var_3327_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_3338 = const()[name = string("op_3338"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3354_pad_type_0 = const()[name = string("op_3354_pad_type_0"), val = string("valid")];
+            int32 var_3354_groups_0 = const()[name = string("op_3354_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3354_strides_0 = const()[name = string("op_3354_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3354_pad_0 = const()[name = string("op_3354_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3354_dilations_0 = const()[name = string("op_3354_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406795456))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409416960))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3339_cast_fp16 = transpose(perm = var_3338, x = attn_output_45_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 2560, 3]> var_3354_cast_fp16 = conv(dilations = var_3354_dilations_0, groups = var_3354_groups_0, pad = var_3354_pad_0, pad_type = var_3354_pad_type_0, strides = var_3354_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_3339_cast_fp16)[name = string("op_3354_cast_fp16")];
+            tensor<int32, [3]> var_3358 = const()[name = string("op_3358"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3364 = const()[name = string("op_3364"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_119_cast_fp16 = transpose(perm = var_3358, x = var_3354_cast_fp16)[name = string("transpose_16")];
+            tensor<fp16, [1, 3, 2560]> var_3366_cast_fp16 = mul(x = x_119_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3366_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_177_cast_fp16 = concat(axis = var_3364, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_3366_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3361_to_fp16 = const()[name = string("op_3361_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_3361_to_fp16, x = input_177_cast_fp16)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_3371_split_sizes_0 = const()[name = string("op_3371_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3371_axis_0 = const()[name = string("op_3371_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3371_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3371_cast_fp16_1 = split(axis = var_3371_axis_0, split_sizes = var_3371_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_3371_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409419584)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_47_cast_fp16 = mul(x = var_3371_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_121_cast_fp16 = add(x = x_111_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_121_cast_fp16")];
+            int32 var_3380 = const()[name = string("op_3380"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3382_cast_fp16 = mul(x = x_121_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3382_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_179_cast_fp16 = concat(axis = var_3380, interleave = input_179_interleave_0, values = (x_121_cast_fp16, var_3382_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3377_to_fp16 = const()[name = string("op_3377_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_3377_to_fp16, x = input_179_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_3387_split_sizes_0 = const()[name = string("op_3387_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3387_axis_0 = const()[name = string("op_3387_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3387_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3387_cast_fp16_1 = split(axis = var_3387_axis_0, split_sizes = var_3387_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_3387_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409424768)))];
+            tensor<fp16, [1, 3, 2560]> h_45_cast_fp16 = mul(x = var_3387_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_3398 = const()[name = string("op_3398"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_181_axes_0 = const()[name = string("input_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3399 = transpose(perm = var_3398, x = h_45_cast_fp16)[name = string("transpose_15")];
+            tensor<fp16, [1, 2560, 1, 3]> input_181 = expand_dims(axes = input_181_axes_0, x = var_3399)[name = string("input_181")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_181)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_181)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 3]> input_183 = mul(x = gate_31, y = up_15)[name = string("input_183")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_183)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_3439_axes_0 = const()[name = string("op_3439_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3439 = squeeze(axes = var_3439_axes_0, x = mlp_out_15)[name = string("op_3439")];
+            tensor<int32, [3]> var_3443 = const()[name = string("op_3443"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3449 = const()[name = string("op_3449"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_123 = transpose(perm = var_3443, x = var_3439)[name = string("transpose_14")];
+            tensor<fp16, [1, 3, 2560]> var_3451 = mul(x = x_123, y = const_61_promoted)[name = string("op_3451")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_185 = concat(axis = var_3449, interleave = input_185_interleave_0, values = (x_123, var_3451))[name = string("input_185")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3446_to_fp16 = const()[name = string("op_3446_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_3446_to_fp16, x = input_185)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_3456_split_sizes_0 = const()[name = string("op_3456_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3456_axis_0 = const()[name = string("op_3456_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3456_0, tensor<fp16, [1, 3, 2560]> var_3456_1 = split(axis = var_3456_axis_0, split_sizes = var_3456_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_3456")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_73 = mul(x = var_3456_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_75_cast_fp16 = add(x = x_121_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 7936])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 3, 8192])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_3484 = const()[name = string("op_3484"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_187_axes_0 = const()[name = string("input_187_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3485 = transpose(perm = var_3484, x = hidden_states_75_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [1, 2560, 1, 3]> input_187 = expand_dims(axes = input_187_axes_0, x = var_3485)[name = string("input_187")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_187)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_3504 = const()[name = string("op_3504"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3505_cast_fp16 = transpose(perm = var_3504, x = per_layer_slice_15_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_3505_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_189_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_189_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409429952))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409757696))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_189_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_3521_axes_0 = const()[name = string("op_3521_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3521_cast_fp16 = squeeze(axes = var_3521_axes_0, x = gated_47_cast_fp16)[name = string("op_3521_cast_fp16")];
+            tensor<int32, [3]> var_3525 = const()[name = string("op_3525"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3531 = const()[name = string("op_3531"), val = int32(-1)];
+            fp16 const_62_promoted_to_fp16 = const()[name = string("const_62_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_125_cast_fp16 = transpose(perm = var_3525, x = var_3521_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 3, 2560]> var_3533_cast_fp16 = mul(x = x_125_cast_fp16, y = const_62_promoted_to_fp16)[name = string("op_3533_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_191_cast_fp16 = concat(axis = var_3531, interleave = input_191_interleave_0, values = (x_125_cast_fp16, var_3533_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3528_to_fp16 = const()[name = string("op_3528_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_3528_to_fp16, x = input_191_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_3538_split_sizes_0 = const()[name = string("op_3538_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3538_axis_0 = const()[name = string("op_3538_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3538_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3538_cast_fp16_1 = split(axis = var_3538_axis_0, split_sizes = var_3538_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_3538_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409760320)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_79_cast_fp16 = mul(x = var_3538_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a2p-1])];
+            tensor<fp16, [1, 3, 2560]> x_127_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_63_promoted_to_fp16)[name = string("x_127_cast_fp16")];
+            int32 var_3553 = const()[name = string("op_3553"), val = int32(-1)];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3555_cast_fp16 = mul(x = x_127_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_3555_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_193_cast_fp16 = concat(axis = var_3553, interleave = input_193_interleave_0, values = (x_127_cast_fp16, var_3555_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3550_to_fp16 = const()[name = string("op_3550_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_3550_to_fp16, x = input_193_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_3560_split_sizes_0 = const()[name = string("op_3560_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3560_axis_0 = const()[name = string("op_3560_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3560_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3560_cast_fp16_1 = split(axis = var_3560_axis_0, split_sizes = var_3560_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_3560_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409765504)))];
+            tensor<fp16, [1, 3, 2560]> h_49_cast_fp16 = mul(x = var_3560_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_3566 = const()[name = string("op_3566"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3569_axes_0 = const()[name = string("op_3569_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3567_cast_fp16 = transpose(perm = var_3566, x = h_49_cast_fp16)[name = string("transpose_10")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3569_cast_fp16 = expand_dims(axes = var_3569_axes_0, x = var_3567_cast_fp16)[name = string("op_3569_cast_fp16")];
+            string q_81_pad_type_0 = const()[name = string("q_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_81_strides_0 = const()[name = string("q_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_81_pad_0 = const()[name = string("q_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_81_dilations_0 = const()[name = string("q_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_81_groups_0 = const()[name = string("q_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_81 = conv(dilations = q_81_dilations_0, groups = q_81_groups_0, pad = q_81_pad_0, pad_type = q_81_pad_type_0, strides = q_81_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_3569_cast_fp16)[name = string("q_81")];
+            tensor<int32, [4]> var_3590 = const()[name = string("op_3590"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_3591 = reshape(shape = var_3590, x = q_81)[name = string("op_3591")];
+            tensor<int32, [4]> transpose_52_perm_0 = const()[name = string("transpose_52_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3614 = const()[name = string("op_3614"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_52 = transpose(perm = transpose_52_perm_0, x = var_3591)[name = string("transpose_9")];
+            tensor<fp16, [3, 8, 256]> x_129 = reshape(shape = var_3614, x = transpose_52)[name = string("x_129")];
+            int32 var_3620 = const()[name = string("op_3620"), val = int32(-1)];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_3622 = mul(x = x_129, y = const_65_promoted)[name = string("op_3622")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_197 = concat(axis = var_3620, interleave = input_197_interleave_0, values = (x_129, var_3622))[name = string("input_197")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3617_to_fp16 = const()[name = string("op_3617_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_3617_to_fp16, x = input_197)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_3627_split_sizes_0 = const()[name = string("op_3627_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3627_axis_0 = const()[name = string("op_3627_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_3627_0, tensor<fp16, [3, 8, 256]> var_3627_1 = split(axis = var_3627_axis_0, split_sizes = var_3627_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_3627")];
+            tensor<fp16, [3, 8, 256]> q_85 = mul(x = var_3627_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_85")];
+            tensor<int32, [4]> var_3634 = const()[name = string("op_3634"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_3635 = reshape(shape = var_3634, x = q_85)[name = string("op_3635")];
+            tensor<int32, [4]> var_3640 = const()[name = string("op_3640"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_87 = transpose(perm = var_3640, x = var_3635)[name = string("transpose_8")];
+            tensor<fp16, [1, 8, 3, 256]> var_3642_cast_fp16 = mul(x = q_87, y = cos_s)[name = string("op_3642_cast_fp16")];
+            tensor<int32, [2]> var_3643_split_sizes_0 = const()[name = string("op_3643_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3643_axis_0 = const()[name = string("op_3643_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_3643_0, tensor<fp16, [1, 8, 3, 128]> var_3643_1 = split(axis = var_3643_axis_0, split_sizes = var_3643_split_sizes_0, x = q_87)[name = string("op_3643")];
+            fp16 const_66_promoted = const()[name = string("const_66_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_3645 = mul(x = var_3643_1, y = const_66_promoted)[name = string("op_3645")];
+            int32 var_3647 = const()[name = string("op_3647"), val = int32(-1)];
+            bool var_3648_interleave_0 = const()[name = string("op_3648_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_3648 = concat(axis = var_3647, interleave = var_3648_interleave_0, values = (var_3645, var_3643_0))[name = string("op_3648")];
+            tensor<fp16, [1, 8, 3, 256]> var_3649_cast_fp16 = mul(x = var_3648, y = sin_s)[name = string("op_3649_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_cast_fp16 = add(x = var_3642_cast_fp16, y = var_3649_cast_fp16)[name = string("q_cast_fp16")];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_131_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_sliding)[name = string("x_131_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_131_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 3, 512]> var_3681 = sub(x = x_131_cast_fp16, y = reduce_max_8)[name = string("op_3681")];
+            tensor<fp16, [1, 8, 3, 512]> var_3687 = exp(x = var_3681)[name = string("op_3687")];
+            tensor<int32, [1]> var_3697_axes_0 = const()[name = string("op_3697_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3697_keep_dims_0 = const()[name = string("op_3697_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3697 = reduce_sum(axes = var_3697_axes_0, keep_dims = var_3697_keep_dims_0, x = var_3687)[name = string("op_3697")];
+            tensor<fp16, [1, 8, 3, 512]> var_3703_cast_fp16 = real_div(x = var_3687, y = var_3697)[name = string("op_3703_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_3703_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_3714 = const()[name = string("op_3714"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3721 = const()[name = string("op_3721"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_3715_cast_fp16 = transpose(perm = var_3714, x = attn_output_49_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 3, 2048]> attn_output_51_cast_fp16 = reshape(shape = var_3721, x = var_3715_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_3726 = const()[name = string("op_3726"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3742_pad_type_0 = const()[name = string("op_3742_pad_type_0"), val = string("valid")];
+            int32 var_3742_groups_0 = const()[name = string("op_3742_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3742_strides_0 = const()[name = string("op_3742_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3742_pad_0 = const()[name = string("op_3742_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3742_dilations_0 = const()[name = string("op_3742_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409770688))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412392192))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3727_cast_fp16 = transpose(perm = var_3726, x = attn_output_51_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 3]> var_3742_cast_fp16 = conv(dilations = var_3742_dilations_0, groups = var_3742_groups_0, pad = var_3742_pad_0, pad_type = var_3742_pad_type_0, strides = var_3742_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_3727_cast_fp16)[name = string("op_3742_cast_fp16")];
+            tensor<int32, [3]> var_3746 = const()[name = string("op_3746"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3752 = const()[name = string("op_3752"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_135_cast_fp16 = transpose(perm = var_3746, x = var_3742_cast_fp16)[name = string("transpose_5")];
+            tensor<fp16, [1, 3, 2560]> var_3754_cast_fp16 = mul(x = x_135_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_3754_cast_fp16")];
+            bool input_201_interleave_0 = const()[name = string("input_201_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_201_cast_fp16 = concat(axis = var_3752, interleave = input_201_interleave_0, values = (x_135_cast_fp16, var_3754_cast_fp16))[name = string("input_201_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3749_to_fp16 = const()[name = string("op_3749_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_3749_to_fp16, x = input_201_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_3759_split_sizes_0 = const()[name = string("op_3759_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3759_axis_0 = const()[name = string("op_3759_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3759_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3759_cast_fp16_1 = split(axis = var_3759_axis_0, split_sizes = var_3759_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_3759_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412394816)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_cast_fp16 = mul(x = var_3759_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_137_cast_fp16 = add(x = x_127_cast_fp16, y = attn_output_cast_fp16)[name = string("x_137_cast_fp16")];
+            int32 var_3768 = const()[name = string("op_3768"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3770_cast_fp16 = mul(x = x_137_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_3770_cast_fp16")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_203_cast_fp16 = concat(axis = var_3768, interleave = input_203_interleave_0, values = (x_137_cast_fp16, var_3770_cast_fp16))[name = string("input_203_cast_fp16")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3765_to_fp16 = const()[name = string("op_3765_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_3765_to_fp16, x = input_203_cast_fp16)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_3775_split_sizes_0 = const()[name = string("op_3775_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3775_axis_0 = const()[name = string("op_3775_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3775_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3775_cast_fp16_1 = split(axis = var_3775_axis_0, split_sizes = var_3775_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_3775_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412400000)))];
+            tensor<fp16, [1, 3, 2560]> h_51_cast_fp16 = mul(x = var_3775_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_3786 = const()[name = string("op_3786"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3787 = transpose(perm = var_3786, x = h_51_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 3]> input_205 = expand_dims(axes = input_205_axes_0, x = var_3787)[name = string("input_205")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_205)[name = string("gate_33")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_205)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate = gelu(mode = gate_mode_0, x = gate_33)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 3]> input_207 = mul(x = gate, y = up)[name = string("input_207")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_207)[name = string("mlp_out")];
+            tensor<int32, [1]> var_3827_axes_0 = const()[name = string("op_3827_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3827 = squeeze(axes = var_3827_axes_0, x = mlp_out)[name = string("op_3827")];
+            tensor<int32, [3]> var_3831 = const()[name = string("op_3831"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3837 = const()[name = string("op_3837"), val = int32(-1)];
+            fp16 const_69_promoted = const()[name = string("const_69_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_139 = transpose(perm = var_3831, x = var_3827)[name = string("transpose_3")];
+            tensor<fp16, [1, 3, 2560]> var_3839 = mul(x = x_139, y = const_69_promoted)[name = string("op_3839")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_209 = concat(axis = var_3837, interleave = input_209_interleave_0, values = (x_139, var_3839))[name = string("input_209")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3834_to_fp16 = const()[name = string("op_3834_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_3834_to_fp16, x = input_209)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_3844_split_sizes_0 = const()[name = string("op_3844_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3844_axis_0 = const()[name = string("op_3844_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3844_0, tensor<fp16, [1, 3, 2560]> var_3844_1 = split(axis = var_3844_axis_0, split_sizes = var_3844_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_3844")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_83 = mul(x = var_3844_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_85_cast_fp16 = add(x = x_137_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 8192])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 3, 8448])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_3872 = const()[name = string("op_3872"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_211_axes_0 = const()[name = string("input_211_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3873 = transpose(perm = var_3872, x = hidden_states_85_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 2560, 1, 3]> input_211 = expand_dims(axes = input_211_axes_0, x = var_3873)[name = string("input_211")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_211)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_3892 = const()[name = string("op_3892"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3893_cast_fp16 = transpose(perm = var_3892, x = per_layer_slice_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_3893_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_213_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_cast_fp16)[name = string("input_213_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412405184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412732928))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_213_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_3909_axes_0 = const()[name = string("op_3909_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3909_cast_fp16 = squeeze(axes = var_3909_axes_0, x = gated_cast_fp16)[name = string("op_3909_cast_fp16")];
+            tensor<int32, [3]> var_3913 = const()[name = string("op_3913"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3919 = const()[name = string("op_3919"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_cast_fp16 = transpose(perm = var_3913, x = var_3909_cast_fp16)[name = string("transpose_0")];
+            tensor<fp16, [1, 3, 2560]> var_3921_cast_fp16 = mul(x = x_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_3921_cast_fp16")];
+            bool input_interleave_0 = const()[name = string("input_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_cast_fp16 = concat(axis = var_3919, interleave = input_interleave_0, values = (x_cast_fp16, var_3921_cast_fp16))[name = string("input_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3916_to_fp16 = const()[name = string("op_3916_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_3916_to_fp16, x = input_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_3926_split_sizes_0 = const()[name = string("op_3926_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3926_axis_0 = const()[name = string("op_3926_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3926_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3926_cast_fp16_1 = split(axis = var_3926_axis_0, split_sizes = var_3926_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_3926_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412735552)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_89_cast_fp16 = mul(x = var_3926_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b4p-1])];
+            tensor<fp16, [1, 3, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_71_promoted_to_fp16)[name = string("op_3936_cast_fp16")];
+        } -> (hidden_states_out);
+}
\ No newline at end of file
diff --git a/chunk3.mlmodelc/weights/weight.bin b/chunk3.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c7703f8d2419ab345266f1a991aa5e4c487542f
--- /dev/null
+++ b/chunk3.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db45f9ce7443de57765ba412a0158ac2ad46a9d2f735fba9376bbdb0aa357b88
+size 412740736
diff --git a/chunk3_3way.mlmodelc/analytics/coremldata.bin b/chunk3_3way.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..de6ca6443658b6dc5c97fa73bb4701adf65c9858
--- /dev/null
+++ b/chunk3_3way.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83149a33c6c49a2607a6e038d88d42ea6829bd7ae99bbc67bbba085b983cff48
+size 243
diff --git a/chunk3_3way.mlmodelc/coremldata.bin b/chunk3_3way.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aa08a79a1986ac94ae90668c5d079531bf3964be
--- /dev/null
+++ b/chunk3_3way.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:346fab1c61528cc8fccc0b5f0d65bb8943b6ebafb0f983a1d6cb1361047195d5
+size 780
diff --git a/chunk3_3way.mlmodelc/metadata.json b/chunk3_3way.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..f8e093f28fed01a3e21f1463cb42d267e9b7e8bd
--- /dev/null
+++ b/chunk3_3way.mlmodelc/metadata.json
@@ -0,0 +1,224 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Mixed (Float16, Palettized (10 bits), Palettized (11 bits), Palettized (13 bits), Palettized (17 bits), Palettized (7 bits), UInt4)",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "token_id",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1)",
+        "shortDescription" : "",
+        "shape" : "[1]",
+        "name" : "token_logit",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 2560)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 2560]",
+        "name" : "hidden_states_out",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 37,
+      "Ios18.mul" : 166,
+      "Ios18.matmul" : 18,
+      "Identity" : 1,
+      "Ios18.exp" : 9,
+      "Ios18.realDiv" : 9,
+      "Split" : 64,
+      "Ios18.gatherAlongAxis" : 1,
+      "Ios16.reduceMax" : 9,
+      "Tile" : 4,
+      "Ios16.reduceSum" : 9,
+      "Ios18.add" : 45,
+      "Ios18.layerNorm" : 55,
+      "Ios18.reduceArgmax" : 1,
+      "Ios18.reshape" : 44,
+      "Ios18.constexprLutToDense" : 64,
+      "Ios18.conv" : 64,
+      "Ios18.gelu" : 18,
+      "Ios18.concat" : 64,
+      "Ios18.sub" : 9,
+      "Ios18.transpose" : 104,
+      "Ios18.tanh" : 1,
+      "Ios18.squeeze" : 20,
+      "Ios18.sliceByIndex" : 9
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-04-30",
+      "com.github.apple.coremltools.source" : "torch==2.11.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 2560)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 2560]",
+        "name" : "hidden_states",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 2048)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 2048]",
+        "name" : "causal_mask_full",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 512]",
+        "name" : "causal_mask_sliding",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 2048 × 1)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 2048, 1]",
+        "name" : "update_mask",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 10752)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 10752]",
+        "name" : "per_layer_combined",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 256]",
+        "name" : "cos_s",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 256]",
+        "name" : "sin_s",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 512]",
+        "name" : "cos_f",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 1 × 1 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 1, 1, 512]",
+        "name" : "sin_f",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 512 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 512, 256]",
+        "name" : "kv13_k",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 512 × 256)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 512, 256]",
+        "name" : "kv13_v",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 2048, 512]",
+        "name" : "kv14_k",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2 × 2048 × 512)",
+        "shortDescription" : "",
+        "shape" : "[1, 2, 2048, 512]",
+        "name" : "kv14_v",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "chunk3_3way",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/chunk3_3way.mlmodelc/model.mil b/chunk3_3way.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..40c2489f08b885fb9c2fbb442f2c20ab58a077d4
--- /dev/null
+++ b/chunk3_3way.mlmodelc/model.mil
@@ -0,0 +1,1971 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func main<ios18>(tensor<fp16, [1, 1, 1, 2048]> causal_mask_full, tensor<fp16, [1, 1, 1, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 1, 512]> cos_f, tensor<fp16, [1, 1, 1, 256]> cos_s, tensor<fp16, [1, 1, 2560]> hidden_states, tensor<fp16, [1, 2, 512, 256]> kv13_k, tensor<fp16, [1, 2, 512, 256]> kv13_v, tensor<fp16, [1, 2, 2048, 512]> kv14_k, tensor<fp16, [1, 2, 2048, 512]> kv14_v, tensor<fp16, [1, 1, 10752]> per_layer_combined, tensor<fp16, [1, 1, 1, 512]> sin_f, tensor<fp16, [1, 1, 1, 256]> sin_s, tensor<fp16, [1, 1, 2048, 1]> update_mask) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15731520))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15741824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28849088))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28859392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41966656))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41969280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41974464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302208))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44924032))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44926144))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58033408))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58043712))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71150976))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71161280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84268544))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84271168)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84276352))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604096))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604416))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89847360))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89851520)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89852608))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102959872))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102970176))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116077440))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116087744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129195008))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129197632)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129202816))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129530560))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129530880))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132152384))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132154496))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145261760))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145272064))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158379328))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158389632))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171496896))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171499520)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171504704))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171832448))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171832768))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174454272))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174456384))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187563648))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187573952))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200681216))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200691520))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213798784))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213801408)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213806592))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214134336))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214134656))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216756160))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216758272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229865536))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229875840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242983104))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242993408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256100672))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256103296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256108480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436224))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259058048))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259060160))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272167424))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272177728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285284992))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285295296))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298402560))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298405184)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298410368))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738112))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738432))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301359936))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301362048))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314469312))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314479616))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327586880))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327597184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340704448))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340707072)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340712256))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040000))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040320))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346283264))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346287424))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(359394688))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(359404992))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(372512256))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(372522560))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385629824))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385632448)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385637632))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385965376))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            int32 var_452 = const()[name = string("op_452"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_454_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_454_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_1_cast_fp16 = concat(axis = var_452, interleave = input_1_interleave_0, values = (hidden_states, var_454_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_449_to_fp16 = const()[name = string("op_449_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_449_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_459_split_sizes_0 = const()[name = string("op_459_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_459_axis_0 = const()[name = string("op_459_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_459_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_459_cast_fp16_1 = split(axis = var_459_axis_0, split_sizes = var_459_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_459_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385965696)))];
+            tensor<fp16, [1, 1, 2560]> h_1_cast_fp16 = mul(x = var_459_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_465 = const()[name = string("op_465"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_468_axes_0 = const()[name = string("op_468_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_466_cast_fp16 = transpose(perm = var_465, x = h_1_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [1, 2560, 1, 1]> var_468_cast_fp16 = expand_dims(axes = var_468_axes_0, x = var_466_cast_fp16)[name = string("op_468_cast_fp16")];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_484_strides_0 = const()[name = string("op_484_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_484_dilations_0 = const()[name = string("op_484_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_484_groups_0 = const()[name = string("op_484_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_484 = conv(dilations = var_484_dilations_0, groups = var_484_groups_0, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_484_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_468_cast_fp16)[name = string("op_484")];
+            tensor<int32, [4]> var_489 = const()[name = string("op_489"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_490 = reshape(shape = var_489, x = var_484)[name = string("op_490")];
+            tensor<int32, [4]> var_495 = const()[name = string("op_495"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_505 = const()[name = string("op_505"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_496 = transpose(perm = var_495, x = var_490)[name = string("transpose_102")];
+            tensor<fp16, [1, 8, 256]> x_1 = reshape(shape = var_505, x = var_496)[name = string("x_1")];
+            int32 var_511 = const()[name = string("op_511"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_513 = mul(x = x_1, y = const_1_promoted)[name = string("op_513")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_5 = concat(axis = var_511, interleave = input_5_interleave_0, values = (x_1, var_513))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_508_to_fp16 = const()[name = string("op_508_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_508_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_518_split_sizes_0 = const()[name = string("op_518_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_518_axis_0 = const()[name = string("op_518_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_518_0, tensor<fp16, [1, 8, 256]> var_518_1 = split(axis = var_518_axis_0, split_sizes = var_518_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_518")];
+            tensor<fp16, [1, 8, 256]> var_520 = mul(x = var_518_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_520")];
+            tensor<int32, [4]> var_525 = const()[name = string("op_525"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_3 = reshape(shape = var_525, x = var_520)[name = string("q_3")];
+            tensor<fp16, [1, 8, 1, 256]> var_527_cast_fp16 = mul(x = q_3, y = cos_s)[name = string("op_527_cast_fp16")];
+            tensor<int32, [2]> var_528_split_sizes_0 = const()[name = string("op_528_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_528_axis_0 = const()[name = string("op_528_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_528_0, tensor<fp16, [1, 8, 1, 128]> var_528_1 = split(axis = var_528_axis_0, split_sizes = var_528_split_sizes_0, x = q_3)[name = string("op_528")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_530 = mul(x = var_528_1, y = const_2_promoted)[name = string("op_530")];
+            int32 var_532 = const()[name = string("op_532"), val = int32(-1)];
+            bool var_533_interleave_0 = const()[name = string("op_533_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_533 = concat(axis = var_532, interleave = var_533_interleave_0, values = (var_530, var_528_0))[name = string("op_533")];
+            tensor<fp16, [1, 8, 1, 256]> var_534_cast_fp16 = mul(x = var_533, y = sin_s)[name = string("op_534_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_5_cast_fp16 = add(x = var_527_cast_fp16, y = var_534_cast_fp16)[name = string("q_5_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = kv13_k)[name = string("transpose_101")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = kv13_v)[name = string("transpose_99")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_5_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_3_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_3_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 1, 512]> var_566 = sub(x = x_3_cast_fp16, y = reduce_max_0)[name = string("op_566")];
+            tensor<fp16, [1, 8, 1, 512]> var_572 = exp(x = var_566)[name = string("op_572")];
+            tensor<int32, [1]> var_582_axes_0 = const()[name = string("op_582_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_582_keep_dims_0 = const()[name = string("op_582_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_582 = reduce_sum(axes = var_582_axes_0, keep_dims = var_582_keep_dims_0, x = var_572)[name = string("op_582")];
+            tensor<fp16, [1, 8, 1, 512]> var_588_cast_fp16 = real_div(x = var_572, y = var_582)[name = string("op_588_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_588_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_599 = const()[name = string("op_599"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_606 = const()[name = string("op_606"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_600_cast_fp16 = transpose(perm = var_599, x = attn_output_1_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 1, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_606, x = var_600_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_611 = const()[name = string("op_611"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_627_pad_type_0 = const()[name = string("op_627_pad_type_0"), val = string("valid")];
+            int32 var_627_groups_0 = const()[name = string("op_627_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_627_strides_0 = const()[name = string("op_627_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_627_pad_0 = const()[name = string("op_627_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_627_dilations_0 = const()[name = string("op_627_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385970880))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388592384))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_612_cast_fp16 = transpose(perm = var_611, x = attn_output_3_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 2560, 1]> var_627_cast_fp16 = conv(dilations = var_627_dilations_0, groups = var_627_groups_0, pad = var_627_pad_0, pad_type = var_627_pad_type_0, strides = var_627_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_612_cast_fp16)[name = string("op_627_cast_fp16")];
+            tensor<int32, [3]> var_631 = const()[name = string("op_631"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_637 = const()[name = string("op_637"), val = int32(-1)];
+            fp16 const_3_promoted_to_fp16 = const()[name = string("const_3_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_7_cast_fp16 = transpose(perm = var_631, x = var_627_cast_fp16)[name = string("transpose_93")];
+            tensor<fp16, [1, 1, 2560]> var_639_cast_fp16 = mul(x = x_7_cast_fp16, y = const_3_promoted_to_fp16)[name = string("op_639_cast_fp16")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_9_cast_fp16 = concat(axis = var_637, interleave = input_9_interleave_0, values = (x_7_cast_fp16, var_639_cast_fp16))[name = string("input_9_cast_fp16")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_634_to_fp16 = const()[name = string("op_634_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_634_to_fp16, x = input_9_cast_fp16)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_644_split_sizes_0 = const()[name = string("op_644_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_644_axis_0 = const()[name = string("op_644_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_644_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_644_cast_fp16_1 = split(axis = var_644_axis_0, split_sizes = var_644_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388595008)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_5_cast_fp16 = mul(x = var_644_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_9_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            int32 var_653 = const()[name = string("op_653"), val = int32(-1)];
+            fp16 const_4_promoted_to_fp16 = const()[name = string("const_4_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_655_cast_fp16 = mul(x = x_9_cast_fp16, y = const_4_promoted_to_fp16)[name = string("op_655_cast_fp16")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_11_cast_fp16 = concat(axis = var_653, interleave = input_11_interleave_0, values = (x_9_cast_fp16, var_655_cast_fp16))[name = string("input_11_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_650_to_fp16 = const()[name = string("op_650_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_650_to_fp16, x = input_11_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_660_split_sizes_0 = const()[name = string("op_660_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_660_axis_0 = const()[name = string("op_660_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_660_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_660_cast_fp16_1 = split(axis = var_660_axis_0, split_sizes = var_660_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_660_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388600192)))];
+            tensor<fp16, [1, 1, 2560]> h_3_cast_fp16 = mul(x = var_660_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_671 = const()[name = string("op_671"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_13_axes_0 = const()[name = string("input_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_672 = transpose(perm = var_671, x = h_3_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 2560, 1, 1]> input_13 = expand_dims(axes = input_13_axes_0, x = var_672)[name = string("input_13")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_13)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_13)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 1]> input_15 = mul(x = gate_3, y = up_1)[name = string("input_15")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_15)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_712_axes_0 = const()[name = string("op_712_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_712 = squeeze(axes = var_712_axes_0, x = mlp_out_1)[name = string("op_712")];
+            tensor<int32, [3]> var_716 = const()[name = string("op_716"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_722 = const()[name = string("op_722"), val = int32(-1)];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_11 = transpose(perm = var_716, x = var_712)[name = string("transpose_91")];
+            tensor<fp16, [1, 1, 2560]> var_724 = mul(x = x_11, y = const_5_promoted)[name = string("op_724")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_17 = concat(axis = var_722, interleave = input_17_interleave_0, values = (x_11, var_724))[name = string("input_17")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_719_to_fp16 = const()[name = string("op_719_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_719_to_fp16, x = input_17)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_729_split_sizes_0 = const()[name = string("op_729_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_729_axis_0 = const()[name = string("op_729_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_729_0, tensor<fp16, [1, 1, 2560]> var_729_1 = split(axis = var_729_axis_0, split_sizes = var_729_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_729")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_3 = mul(x = var_729_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_5_cast_fp16 = add(x = x_9_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 8448])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 1, 8704])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_757 = const()[name = string("op_757"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_758 = transpose(perm = var_757, x = hidden_states_5_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 2560, 1, 1]> input_19 = expand_dims(axes = input_19_axes_0, x = var_758)[name = string("input_19")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_19)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_777 = const()[name = string("op_777"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_778_cast_fp16 = transpose(perm = var_777, x = per_layer_slice_1_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_778_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_21_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_21_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388605376))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388933120))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_21_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_794_axes_0 = const()[name = string("op_794_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_794_cast_fp16 = squeeze(axes = var_794_axes_0, x = gated_5_cast_fp16)[name = string("op_794_cast_fp16")];
+            tensor<int32, [3]> var_798 = const()[name = string("op_798"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_804 = const()[name = string("op_804"), val = int32(-1)];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_13_cast_fp16 = transpose(perm = var_798, x = var_794_cast_fp16)[name = string("transpose_88")];
+            tensor<fp16, [1, 1, 2560]> var_806_cast_fp16 = mul(x = x_13_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_806_cast_fp16")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_23_cast_fp16 = concat(axis = var_804, interleave = input_23_interleave_0, values = (x_13_cast_fp16, var_806_cast_fp16))[name = string("input_23_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_801_to_fp16 = const()[name = string("op_801_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_801_to_fp16, x = input_23_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_811_split_sizes_0 = const()[name = string("op_811_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_811_axis_0 = const()[name = string("op_811_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_811_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_811_cast_fp16_1 = split(axis = var_811_axis_0, split_sizes = var_811_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_811_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388935744)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_9_cast_fp16 = mul(x = var_811_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_15_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_826 = const()[name = string("op_826"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_828_cast_fp16 = mul(x = x_15_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_828_cast_fp16")];
+            bool input_25_interleave_0 = const()[name = string("input_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_25_cast_fp16 = concat(axis = var_826, interleave = input_25_interleave_0, values = (x_15_cast_fp16, var_828_cast_fp16))[name = string("input_25_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_823_to_fp16 = const()[name = string("op_823_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_823_to_fp16, x = input_25_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_833_split_sizes_0 = const()[name = string("op_833_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_833_axis_0 = const()[name = string("op_833_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_833_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_833_cast_fp16_1 = split(axis = var_833_axis_0, split_sizes = var_833_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_833_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388940928)))];
+            tensor<fp16, [1, 1, 2560]> h_7_cast_fp16 = mul(x = var_833_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_839 = const()[name = string("op_839"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_842_axes_0 = const()[name = string("op_842_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_840_cast_fp16 = transpose(perm = var_839, x = h_7_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 2560, 1, 1]> var_842_cast_fp16 = expand_dims(axes = var_842_axes_0, x = var_840_cast_fp16)[name = string("op_842_cast_fp16")];
+            string var_858_pad_type_0 = const()[name = string("op_858_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_858_strides_0 = const()[name = string("op_858_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_858_pad_0 = const()[name = string("op_858_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_858_dilations_0 = const()[name = string("op_858_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_858_groups_0 = const()[name = string("op_858_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_858 = conv(dilations = var_858_dilations_0, groups = var_858_groups_0, pad = var_858_pad_0, pad_type = var_858_pad_type_0, strides = var_858_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_842_cast_fp16)[name = string("op_858")];
+            tensor<int32, [4]> var_863 = const()[name = string("op_863"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_864 = reshape(shape = var_863, x = var_858)[name = string("op_864")];
+            tensor<int32, [4]> var_869 = const()[name = string("op_869"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_879 = const()[name = string("op_879"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_870 = transpose(perm = var_869, x = var_864)[name = string("transpose_86")];
+            tensor<fp16, [1, 8, 256]> x_17 = reshape(shape = var_879, x = var_870)[name = string("x_17")];
+            int32 var_885 = const()[name = string("op_885"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_887 = mul(x = x_17, y = const_9_promoted)[name = string("op_887")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_29 = concat(axis = var_885, interleave = input_29_interleave_0, values = (x_17, var_887))[name = string("input_29")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_882_to_fp16 = const()[name = string("op_882_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_882_to_fp16, x = input_29)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_892_split_sizes_0 = const()[name = string("op_892_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_892_axis_0 = const()[name = string("op_892_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_892_0, tensor<fp16, [1, 8, 256]> var_892_1 = split(axis = var_892_axis_0, split_sizes = var_892_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_892")];
+            tensor<fp16, [1, 8, 256]> var_894 = mul(x = var_892_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_894")];
+            tensor<int32, [4]> var_899 = const()[name = string("op_899"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_9 = reshape(shape = var_899, x = var_894)[name = string("q_9")];
+            tensor<fp16, [1, 8, 1, 256]> var_901_cast_fp16 = mul(x = q_9, y = cos_s)[name = string("op_901_cast_fp16")];
+            tensor<int32, [2]> var_902_split_sizes_0 = const()[name = string("op_902_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_902_axis_0 = const()[name = string("op_902_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_902_0, tensor<fp16, [1, 8, 1, 128]> var_902_1 = split(axis = var_902_axis_0, split_sizes = var_902_split_sizes_0, x = q_9)[name = string("op_902")];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_904 = mul(x = var_902_1, y = const_10_promoted)[name = string("op_904")];
+            int32 var_906 = const()[name = string("op_906"), val = int32(-1)];
+            bool var_907_interleave_0 = const()[name = string("op_907_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_907 = concat(axis = var_906, interleave = var_907_interleave_0, values = (var_904, var_902_0))[name = string("op_907")];
+            tensor<fp16, [1, 8, 1, 256]> var_908_cast_fp16 = mul(x = var_907, y = sin_s)[name = string("op_908_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_11_cast_fp16 = add(x = var_901_cast_fp16, y = var_908_cast_fp16)[name = string("q_11_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_11_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_19_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_19_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 1, 512]> var_940 = sub(x = x_19_cast_fp16, y = reduce_max_1)[name = string("op_940")];
+            tensor<fp16, [1, 8, 1, 512]> var_946 = exp(x = var_940)[name = string("op_946")];
+            tensor<int32, [1]> var_956_axes_0 = const()[name = string("op_956_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_956_keep_dims_0 = const()[name = string("op_956_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_956 = reduce_sum(axes = var_956_axes_0, keep_dims = var_956_keep_dims_0, x = var_946)[name = string("op_956")];
+            tensor<fp16, [1, 8, 1, 512]> var_962_cast_fp16 = real_div(x = var_946, y = var_956)[name = string("op_962_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_962_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_973 = const()[name = string("op_973"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_980 = const()[name = string("op_980"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_974_cast_fp16 = transpose(perm = var_973, x = attn_output_7_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 1, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_980, x = var_974_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_985 = const()[name = string("op_985"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1001_pad_type_0 = const()[name = string("op_1001_pad_type_0"), val = string("valid")];
+            int32 var_1001_groups_0 = const()[name = string("op_1001_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1001_strides_0 = const()[name = string("op_1001_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1001_pad_0 = const()[name = string("op_1001_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1001_dilations_0 = const()[name = string("op_1001_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388946112))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391567616))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_986_cast_fp16 = transpose(perm = var_985, x = attn_output_9_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 2560, 1]> var_1001_cast_fp16 = conv(dilations = var_1001_dilations_0, groups = var_1001_groups_0, pad = var_1001_pad_0, pad_type = var_1001_pad_type_0, strides = var_1001_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_986_cast_fp16)[name = string("op_1001_cast_fp16")];
+            tensor<int32, [3]> var_1005 = const()[name = string("op_1005"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1011 = const()[name = string("op_1011"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_23_cast_fp16 = transpose(perm = var_1005, x = var_1001_cast_fp16)[name = string("transpose_83")];
+            tensor<fp16, [1, 1, 2560]> var_1013_cast_fp16 = mul(x = x_23_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1013_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_33_cast_fp16 = concat(axis = var_1011, interleave = input_33_interleave_0, values = (x_23_cast_fp16, var_1013_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1008_to_fp16 = const()[name = string("op_1008_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1008_to_fp16, x = input_33_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1018_split_sizes_0 = const()[name = string("op_1018_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1018_axis_0 = const()[name = string("op_1018_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1018_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1018_cast_fp16_1 = split(axis = var_1018_axis_0, split_sizes = var_1018_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1018_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391570240)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_11_cast_fp16 = mul(x = var_1018_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_25_cast_fp16 = add(x = x_15_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_25_cast_fp16")];
+            int32 var_1027 = const()[name = string("op_1027"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1029_cast_fp16 = mul(x = x_25_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1029_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_35_cast_fp16 = concat(axis = var_1027, interleave = input_35_interleave_0, values = (x_25_cast_fp16, var_1029_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1024_to_fp16 = const()[name = string("op_1024_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1024_to_fp16, x = input_35_cast_fp16)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1034_split_sizes_0 = const()[name = string("op_1034_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1034_axis_0 = const()[name = string("op_1034_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1034_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1034_cast_fp16_1 = split(axis = var_1034_axis_0, split_sizes = var_1034_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1034_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391575424)))];
+            tensor<fp16, [1, 1, 2560]> h_9_cast_fp16 = mul(x = var_1034_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1045 = const()[name = string("op_1045"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_37_axes_0 = const()[name = string("input_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1046 = transpose(perm = var_1045, x = h_9_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1, 1]> input_37 = expand_dims(axes = input_37_axes_0, x = var_1046)[name = string("input_37")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_37)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_37)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 1]> input_39 = mul(x = gate_7, y = up_3)[name = string("input_39")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_39)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1086_axes_0 = const()[name = string("op_1086_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1086 = squeeze(axes = var_1086_axes_0, x = mlp_out_3)[name = string("op_1086")];
+            tensor<int32, [3]> var_1090 = const()[name = string("op_1090"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1096 = const()[name = string("op_1096"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_27 = transpose(perm = var_1090, x = var_1086)[name = string("transpose_81")];
+            tensor<fp16, [1, 1, 2560]> var_1098 = mul(x = x_27, y = const_13_promoted)[name = string("op_1098")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_41 = concat(axis = var_1096, interleave = input_41_interleave_0, values = (x_27, var_1098))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1093_to_fp16 = const()[name = string("op_1093_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1093_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1103_split_sizes_0 = const()[name = string("op_1103_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1103_axis_0 = const()[name = string("op_1103_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1103_0, tensor<fp16, [1, 1, 2560]> var_1103_1 = split(axis = var_1103_axis_0, split_sizes = var_1103_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1103")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_13 = mul(x = var_1103_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_15_cast_fp16 = add(x = x_25_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 8704])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 1, 8960])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1131 = const()[name = string("op_1131"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_43_axes_0 = const()[name = string("input_43_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1132 = transpose(perm = var_1131, x = hidden_states_15_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 2560, 1, 1]> input_43 = expand_dims(axes = input_43_axes_0, x = var_1132)[name = string("input_43")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_43)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1151 = const()[name = string("op_1151"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1152_cast_fp16 = transpose(perm = var_1151, x = per_layer_slice_3_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1152_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_45_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391580608))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391908352))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_45_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1168_axes_0 = const()[name = string("op_1168_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1168_cast_fp16 = squeeze(axes = var_1168_axes_0, x = gated_11_cast_fp16)[name = string("op_1168_cast_fp16")];
+            tensor<int32, [3]> var_1172 = const()[name = string("op_1172"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1178 = const()[name = string("op_1178"), val = int32(-1)];
+            fp16 const_14_promoted_to_fp16 = const()[name = string("const_14_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_29_cast_fp16 = transpose(perm = var_1172, x = var_1168_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [1, 1, 2560]> var_1180_cast_fp16 = mul(x = x_29_cast_fp16, y = const_14_promoted_to_fp16)[name = string("op_1180_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_47_cast_fp16 = concat(axis = var_1178, interleave = input_47_interleave_0, values = (x_29_cast_fp16, var_1180_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1175_to_fp16 = const()[name = string("op_1175_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1175_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1185_split_sizes_0 = const()[name = string("op_1185_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1185_axis_0 = const()[name = string("op_1185_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1185_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1185_cast_fp16_1 = split(axis = var_1185_axis_0, split_sizes = var_1185_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1185_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391910976)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1185_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_15_promoted_to_fp16 = const()[name = string("const_15_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.acp-1])];
+            tensor<fp16, [1, 1, 2560]> x_31_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_15_promoted_to_fp16)[name = string("x_31_cast_fp16")];
+            int32 var_1200 = const()[name = string("op_1200"), val = int32(-1)];
+            fp16 const_16_promoted_to_fp16 = const()[name = string("const_16_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1202_cast_fp16 = mul(x = x_31_cast_fp16, y = const_16_promoted_to_fp16)[name = string("op_1202_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_49_cast_fp16 = concat(axis = var_1200, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1202_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1197_to_fp16 = const()[name = string("op_1197_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1197_to_fp16, x = input_49_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1207_split_sizes_0 = const()[name = string("op_1207_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1207_axis_0 = const()[name = string("op_1207_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1207_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1207_cast_fp16_1 = split(axis = var_1207_axis_0, split_sizes = var_1207_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1207_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391916160)))];
+            tensor<fp16, [1, 1, 2560]> h_13_cast_fp16 = mul(x = var_1207_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1213 = const()[name = string("op_1213"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1216_axes_0 = const()[name = string("op_1216_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1214_cast_fp16 = transpose(perm = var_1213, x = h_13_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1216_cast_fp16 = expand_dims(axes = var_1216_axes_0, x = var_1214_cast_fp16)[name = string("op_1216_cast_fp16")];
+            string var_1232_pad_type_0 = const()[name = string("op_1232_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1232_strides_0 = const()[name = string("op_1232_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1232_pad_0 = const()[name = string("op_1232_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1232_dilations_0 = const()[name = string("op_1232_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1232_groups_0 = const()[name = string("op_1232_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_1232 = conv(dilations = var_1232_dilations_0, groups = var_1232_groups_0, pad = var_1232_pad_0, pad_type = var_1232_pad_type_0, strides = var_1232_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1216_cast_fp16)[name = string("op_1232")];
+            tensor<int32, [4]> var_1237 = const()[name = string("op_1237"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_1238 = reshape(shape = var_1237, x = var_1232)[name = string("op_1238")];
+            tensor<int32, [4]> var_1243 = const()[name = string("op_1243"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1253 = const()[name = string("op_1253"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_1244 = transpose(perm = var_1243, x = var_1238)[name = string("transpose_76")];
+            tensor<fp16, [1, 8, 512]> x_33 = reshape(shape = var_1253, x = var_1244)[name = string("x_33")];
+            int32 var_1259 = const()[name = string("op_1259"), val = int32(-1)];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_1261 = mul(x = x_33, y = const_17_promoted)[name = string("op_1261")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_53 = concat(axis = var_1259, interleave = input_53_interleave_0, values = (x_33, var_1261))[name = string("input_53")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1256_to_fp16 = const()[name = string("op_1256_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1256_to_fp16, x = input_53)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1266_split_sizes_0 = const()[name = string("op_1266_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_1266_axis_0 = const()[name = string("op_1266_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_1266_0, tensor<fp16, [1, 8, 512]> var_1266_1 = split(axis = var_1266_axis_0, split_sizes = var_1266_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1266")];
+            tensor<fp16, [1, 8, 512]> var_1268 = mul(x = var_1266_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_1268")];
+            tensor<int32, [4]> var_1273 = const()[name = string("op_1273"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_15 = reshape(shape = var_1273, x = var_1268)[name = string("q_15")];
+            tensor<fp16, [1, 8, 1, 512]> var_1275_cast_fp16 = mul(x = q_15, y = cos_f)[name = string("op_1275_cast_fp16")];
+            tensor<int32, [2]> var_1276_split_sizes_0 = const()[name = string("op_1276_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1276_axis_0 = const()[name = string("op_1276_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_1276_0, tensor<fp16, [1, 8, 1, 256]> var_1276_1 = split(axis = var_1276_axis_0, split_sizes = var_1276_split_sizes_0, x = q_15)[name = string("op_1276")];
+            fp16 const_18_promoted = const()[name = string("const_18_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_1278 = mul(x = var_1276_1, y = const_18_promoted)[name = string("op_1278")];
+            int32 var_1280 = const()[name = string("op_1280"), val = int32(-1)];
+            bool var_1281_interleave_0 = const()[name = string("op_1281_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_1281 = concat(axis = var_1280, interleave = var_1281_interleave_0, values = (var_1278, var_1276_0))[name = string("op_1281")];
+            tensor<fp16, [1, 8, 1, 512]> var_1282_cast_fp16 = mul(x = var_1281, y = sin_f)[name = string("op_1282_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_17_cast_fp16 = add(x = var_1275_cast_fp16, y = var_1282_cast_fp16)[name = string("q_17_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = kv14_k)[name = string("transpose_75")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = kv14_v)[name = string("transpose_73")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_17_cast_fp16, y = transpose_38_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_35_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_full)[name = string("x_35_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_35_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 1, 2048]> var_1314 = sub(x = x_35_cast_fp16, y = reduce_max_2)[name = string("op_1314")];
+            tensor<fp16, [1, 8, 1, 2048]> var_1320 = exp(x = var_1314)[name = string("op_1320")];
+            tensor<int32, [1]> var_1330_axes_0 = const()[name = string("op_1330_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1330_keep_dims_0 = const()[name = string("op_1330_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1330 = reduce_sum(axes = var_1330_axes_0, keep_dims = var_1330_keep_dims_0, x = var_1320)[name = string("op_1330")];
+            tensor<fp16, [1, 8, 1, 2048]> var_1336_cast_fp16 = real_div(x = var_1320, y = var_1330)[name = string("op_1336_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_1336_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1347 = const()[name = string("op_1347"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1354 = const()[name = string("op_1354"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_1348_cast_fp16 = transpose(perm = var_1347, x = attn_output_13_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 1, 4096]> attn_output_15_cast_fp16 = reshape(shape = var_1354, x = var_1348_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_1359 = const()[name = string("op_1359"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1375_pad_type_0 = const()[name = string("op_1375_pad_type_0"), val = string("valid")];
+            int32 var_1375_groups_0 = const()[name = string("op_1375_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1375_strides_0 = const()[name = string("op_1375_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1375_pad_0 = const()[name = string("op_1375_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1375_dilations_0 = const()[name = string("op_1375_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391921344))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397164288))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_1360_cast_fp16 = transpose(perm = var_1359, x = attn_output_15_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 2560, 1]> var_1375_cast_fp16 = conv(dilations = var_1375_dilations_0, groups = var_1375_groups_0, pad = var_1375_pad_0, pad_type = var_1375_pad_type_0, strides = var_1375_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_1360_cast_fp16)[name = string("op_1375_cast_fp16")];
+            tensor<int32, [3]> var_1379 = const()[name = string("op_1379"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1385 = const()[name = string("op_1385"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_39_cast_fp16 = transpose(perm = var_1379, x = var_1375_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 1, 2560]> var_1387_cast_fp16 = mul(x = x_39_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1387_cast_fp16")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_57_cast_fp16 = concat(axis = var_1385, interleave = input_57_interleave_0, values = (x_39_cast_fp16, var_1387_cast_fp16))[name = string("input_57_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1382_to_fp16 = const()[name = string("op_1382_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1382_to_fp16, x = input_57_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1392_split_sizes_0 = const()[name = string("op_1392_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1392_axis_0 = const()[name = string("op_1392_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1392_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1392_cast_fp16_1 = split(axis = var_1392_axis_0, split_sizes = var_1392_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1392_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397166912)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_17_cast_fp16 = mul(x = var_1392_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_41_cast_fp16 = add(x = x_31_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_41_cast_fp16")];
+            int32 var_1401 = const()[name = string("op_1401"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1403_cast_fp16 = mul(x = x_41_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1403_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_59_cast_fp16 = concat(axis = var_1401, interleave = input_59_interleave_0, values = (x_41_cast_fp16, var_1403_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1398_to_fp16 = const()[name = string("op_1398_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1398_to_fp16, x = input_59_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1408_split_sizes_0 = const()[name = string("op_1408_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1408_axis_0 = const()[name = string("op_1408_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1408_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1408_cast_fp16_1 = split(axis = var_1408_axis_0, split_sizes = var_1408_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1408_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397172096)))];
+            tensor<fp16, [1, 1, 2560]> h_15_cast_fp16 = mul(x = var_1408_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_1419 = const()[name = string("op_1419"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_61_axes_0 = const()[name = string("input_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1420 = transpose(perm = var_1419, x = h_15_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 2560, 1, 1]> input_61 = expand_dims(axes = input_61_axes_0, x = var_1420)[name = string("input_61")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_61)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_61)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 1]> input_63 = mul(x = gate_11, y = up_5)[name = string("input_63")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_63)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_1460_axes_0 = const()[name = string("op_1460_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1460 = squeeze(axes = var_1460_axes_0, x = mlp_out_5)[name = string("op_1460")];
+            tensor<int32, [3]> var_1464 = const()[name = string("op_1464"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1470 = const()[name = string("op_1470"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_43 = transpose(perm = var_1464, x = var_1460)[name = string("transpose_65")];
+            tensor<fp16, [1, 1, 2560]> var_1472 = mul(x = x_43, y = const_21_promoted)[name = string("op_1472")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_65 = concat(axis = var_1470, interleave = input_65_interleave_0, values = (x_43, var_1472))[name = string("input_65")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1467_to_fp16 = const()[name = string("op_1467_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_1467_to_fp16, x = input_65)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_1477_split_sizes_0 = const()[name = string("op_1477_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1477_axis_0 = const()[name = string("op_1477_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1477_0, tensor<fp16, [1, 1, 2560]> var_1477_1 = split(axis = var_1477_axis_0, split_sizes = var_1477_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_1477")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_23 = mul(x = var_1477_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_25_cast_fp16 = add(x = x_41_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 8960])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 1, 9216])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_1505 = const()[name = string("op_1505"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_67_axes_0 = const()[name = string("input_67_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1506 = transpose(perm = var_1505, x = hidden_states_25_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 2560, 1, 1]> input_67 = expand_dims(axes = input_67_axes_0, x = var_1506)[name = string("input_67")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_67)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_1525 = const()[name = string("op_1525"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1526_cast_fp16 = transpose(perm = var_1525, x = per_layer_slice_5_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_1526_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_69_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_69_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397177280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397505024))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_69_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_1542_axes_0 = const()[name = string("op_1542_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1542_cast_fp16 = squeeze(axes = var_1542_axes_0, x = gated_17_cast_fp16)[name = string("op_1542_cast_fp16")];
+            tensor<int32, [3]> var_1546 = const()[name = string("op_1546"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1552 = const()[name = string("op_1552"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_45_cast_fp16 = transpose(perm = var_1546, x = var_1542_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 1, 2560]> var_1554_cast_fp16 = mul(x = x_45_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1554_cast_fp16")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_71_cast_fp16 = concat(axis = var_1552, interleave = input_71_interleave_0, values = (x_45_cast_fp16, var_1554_cast_fp16))[name = string("input_71_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1549_to_fp16 = const()[name = string("op_1549_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_1549_to_fp16, x = input_71_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_1559_split_sizes_0 = const()[name = string("op_1559_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1559_axis_0 = const()[name = string("op_1559_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1559_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1559_cast_fp16_1 = split(axis = var_1559_axis_0, split_sizes = var_1559_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_1559_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397507648)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_29_cast_fp16 = mul(x = var_1559_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.acp-1])];
+            tensor<fp16, [1, 1, 2560]> x_47_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_47_cast_fp16")];
+            int32 var_1574 = const()[name = string("op_1574"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1576_cast_fp16 = mul(x = x_47_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1576_cast_fp16")];
+            bool input_73_interleave_0 = const()[name = string("input_73_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_73_cast_fp16 = concat(axis = var_1574, interleave = input_73_interleave_0, values = (x_47_cast_fp16, var_1576_cast_fp16))[name = string("input_73_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1571_to_fp16 = const()[name = string("op_1571_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_1571_to_fp16, x = input_73_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_1581_split_sizes_0 = const()[name = string("op_1581_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1581_axis_0 = const()[name = string("op_1581_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1581_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1581_cast_fp16_1 = split(axis = var_1581_axis_0, split_sizes = var_1581_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_1581_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397512832)))];
+            tensor<fp16, [1, 1, 2560]> h_19_cast_fp16 = mul(x = var_1581_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_1587 = const()[name = string("op_1587"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1590_axes_0 = const()[name = string("op_1590_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1588_cast_fp16 = transpose(perm = var_1587, x = h_19_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1590_cast_fp16 = expand_dims(axes = var_1590_axes_0, x = var_1588_cast_fp16)[name = string("op_1590_cast_fp16")];
+            string var_1606_pad_type_0 = const()[name = string("op_1606_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1606_strides_0 = const()[name = string("op_1606_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1606_pad_0 = const()[name = string("op_1606_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1606_dilations_0 = const()[name = string("op_1606_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1606_groups_0 = const()[name = string("op_1606_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1606 = conv(dilations = var_1606_dilations_0, groups = var_1606_groups_0, pad = var_1606_pad_0, pad_type = var_1606_pad_type_0, strides = var_1606_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_1590_cast_fp16)[name = string("op_1606")];
+            tensor<int32, [4]> var_1611 = const()[name = string("op_1611"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1612 = reshape(shape = var_1611, x = var_1606)[name = string("op_1612")];
+            tensor<int32, [4]> var_1617 = const()[name = string("op_1617"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1627 = const()[name = string("op_1627"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1618 = transpose(perm = var_1617, x = var_1612)[name = string("transpose_60")];
+            tensor<fp16, [1, 8, 256]> x_49 = reshape(shape = var_1627, x = var_1618)[name = string("x_49")];
+            int32 var_1633 = const()[name = string("op_1633"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1635 = mul(x = x_49, y = const_25_promoted)[name = string("op_1635")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_77 = concat(axis = var_1633, interleave = input_77_interleave_0, values = (x_49, var_1635))[name = string("input_77")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1630_to_fp16 = const()[name = string("op_1630_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_1630_to_fp16, x = input_77)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_1640_split_sizes_0 = const()[name = string("op_1640_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1640_axis_0 = const()[name = string("op_1640_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1640_0, tensor<fp16, [1, 8, 256]> var_1640_1 = split(axis = var_1640_axis_0, split_sizes = var_1640_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_1640")];
+            tensor<fp16, [1, 8, 256]> var_1642 = mul(x = var_1640_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_1642")];
+            tensor<int32, [4]> var_1647 = const()[name = string("op_1647"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_21 = reshape(shape = var_1647, x = var_1642)[name = string("q_21")];
+            tensor<fp16, [1, 8, 1, 256]> var_1649_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_1649_cast_fp16")];
+            tensor<int32, [2]> var_1650_split_sizes_0 = const()[name = string("op_1650_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1650_axis_0 = const()[name = string("op_1650_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1650_0, tensor<fp16, [1, 8, 1, 128]> var_1650_1 = split(axis = var_1650_axis_0, split_sizes = var_1650_split_sizes_0, x = q_21)[name = string("op_1650")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1652 = mul(x = var_1650_1, y = const_26_promoted)[name = string("op_1652")];
+            int32 var_1654 = const()[name = string("op_1654"), val = int32(-1)];
+            bool var_1655_interleave_0 = const()[name = string("op_1655_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1655 = concat(axis = var_1654, interleave = var_1655_interleave_0, values = (var_1652, var_1650_0))[name = string("op_1655")];
+            tensor<fp16, [1, 8, 1, 256]> var_1656_cast_fp16 = mul(x = var_1655, y = sin_s)[name = string("op_1656_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_23_cast_fp16 = add(x = var_1649_cast_fp16, y = var_1656_cast_fp16)[name = string("q_23_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_23_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_51_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_51_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_51_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 1, 512]> var_1688 = sub(x = x_51_cast_fp16, y = reduce_max_3)[name = string("op_1688")];
+            tensor<fp16, [1, 8, 1, 512]> var_1694 = exp(x = var_1688)[name = string("op_1694")];
+            tensor<int32, [1]> var_1704_axes_0 = const()[name = string("op_1704_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1704_keep_dims_0 = const()[name = string("op_1704_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1704 = reduce_sum(axes = var_1704_axes_0, keep_dims = var_1704_keep_dims_0, x = var_1694)[name = string("op_1704")];
+            tensor<fp16, [1, 8, 1, 512]> var_1710_cast_fp16 = real_div(x = var_1694, y = var_1704)[name = string("op_1710_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_1710_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_1721 = const()[name = string("op_1721"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1728 = const()[name = string("op_1728"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1722_cast_fp16 = transpose(perm = var_1721, x = attn_output_19_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 1, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_1728, x = var_1722_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_1733 = const()[name = string("op_1733"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1749_pad_type_0 = const()[name = string("op_1749_pad_type_0"), val = string("valid")];
+            int32 var_1749_groups_0 = const()[name = string("op_1749_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1749_strides_0 = const()[name = string("op_1749_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1749_pad_0 = const()[name = string("op_1749_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1749_dilations_0 = const()[name = string("op_1749_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397518016))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400139520))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1734_cast_fp16 = transpose(perm = var_1733, x = attn_output_21_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 2560, 1]> var_1749_cast_fp16 = conv(dilations = var_1749_dilations_0, groups = var_1749_groups_0, pad = var_1749_pad_0, pad_type = var_1749_pad_type_0, strides = var_1749_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_1734_cast_fp16)[name = string("op_1749_cast_fp16")];
+            tensor<int32, [3]> var_1753 = const()[name = string("op_1753"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1759 = const()[name = string("op_1759"), val = int32(-1)];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_55_cast_fp16 = transpose(perm = var_1753, x = var_1749_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 1, 2560]> var_1761_cast_fp16 = mul(x = x_55_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_1761_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_81_cast_fp16 = concat(axis = var_1759, interleave = input_81_interleave_0, values = (x_55_cast_fp16, var_1761_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1756_to_fp16 = const()[name = string("op_1756_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_1756_to_fp16, x = input_81_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_1766_split_sizes_0 = const()[name = string("op_1766_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1766_axis_0 = const()[name = string("op_1766_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1766_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1766_cast_fp16_1 = split(axis = var_1766_axis_0, split_sizes = var_1766_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_1766_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400142144)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_23_cast_fp16 = mul(x = var_1766_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_57_cast_fp16 = add(x = x_47_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_57_cast_fp16")];
+            int32 var_1775 = const()[name = string("op_1775"), val = int32(-1)];
+            fp16 const_28_promoted_to_fp16 = const()[name = string("const_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1777_cast_fp16 = mul(x = x_57_cast_fp16, y = const_28_promoted_to_fp16)[name = string("op_1777_cast_fp16")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_83_cast_fp16 = concat(axis = var_1775, interleave = input_83_interleave_0, values = (x_57_cast_fp16, var_1777_cast_fp16))[name = string("input_83_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1772_to_fp16 = const()[name = string("op_1772_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_1772_to_fp16, x = input_83_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_1782_split_sizes_0 = const()[name = string("op_1782_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1782_axis_0 = const()[name = string("op_1782_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1782_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1782_cast_fp16_1 = split(axis = var_1782_axis_0, split_sizes = var_1782_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_1782_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400147328)))];
+            tensor<fp16, [1, 1, 2560]> h_21_cast_fp16 = mul(x = var_1782_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_1793 = const()[name = string("op_1793"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1794 = transpose(perm = var_1793, x = h_21_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 1, 1]> input_85 = expand_dims(axes = input_85_axes_0, x = var_1794)[name = string("input_85")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_85)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_85)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 1]> input_87 = mul(x = gate_15, y = up_7)[name = string("input_87")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_87)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_1834_axes_0 = const()[name = string("op_1834_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1834 = squeeze(axes = var_1834_axes_0, x = mlp_out_7)[name = string("op_1834")];
+            tensor<int32, [3]> var_1838 = const()[name = string("op_1838"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1844 = const()[name = string("op_1844"), val = int32(-1)];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_59 = transpose(perm = var_1838, x = var_1834)[name = string("transpose_55")];
+            tensor<fp16, [1, 1, 2560]> var_1846 = mul(x = x_59, y = const_29_promoted)[name = string("op_1846")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_89 = concat(axis = var_1844, interleave = input_89_interleave_0, values = (x_59, var_1846))[name = string("input_89")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1841_to_fp16 = const()[name = string("op_1841_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_1841_to_fp16, x = input_89)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_1851_split_sizes_0 = const()[name = string("op_1851_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1851_axis_0 = const()[name = string("op_1851_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1851_0, tensor<fp16, [1, 1, 2560]> var_1851_1 = split(axis = var_1851_axis_0, split_sizes = var_1851_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_1851")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_33 = mul(x = var_1851_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_35_cast_fp16 = add(x = x_57_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 9216])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 1, 9472])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_1879 = const()[name = string("op_1879"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_91_axes_0 = const()[name = string("input_91_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1880 = transpose(perm = var_1879, x = hidden_states_35_cast_fp16)[name = string("transpose_54")];
+            tensor<fp16, [1, 2560, 1, 1]> input_91 = expand_dims(axes = input_91_axes_0, x = var_1880)[name = string("input_91")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_91)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_1899 = const()[name = string("op_1899"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1900_cast_fp16 = transpose(perm = var_1899, x = per_layer_slice_7_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_1900_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_93_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_93_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400152512))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400480256))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_93_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_1916_axes_0 = const()[name = string("op_1916_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1916_cast_fp16 = squeeze(axes = var_1916_axes_0, x = gated_23_cast_fp16)[name = string("op_1916_cast_fp16")];
+            tensor<int32, [3]> var_1920 = const()[name = string("op_1920"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1926 = const()[name = string("op_1926"), val = int32(-1)];
+            fp16 const_30_promoted_to_fp16 = const()[name = string("const_30_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_61_cast_fp16 = transpose(perm = var_1920, x = var_1916_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 1, 2560]> var_1928_cast_fp16 = mul(x = x_61_cast_fp16, y = const_30_promoted_to_fp16)[name = string("op_1928_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_95_cast_fp16 = concat(axis = var_1926, interleave = input_95_interleave_0, values = (x_61_cast_fp16, var_1928_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1923_to_fp16 = const()[name = string("op_1923_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_1923_to_fp16, x = input_95_cast_fp16)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_1933_split_sizes_0 = const()[name = string("op_1933_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1933_axis_0 = const()[name = string("op_1933_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1933_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1933_cast_fp16_1 = split(axis = var_1933_axis_0, split_sizes = var_1933_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_1933_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400482880)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_39_cast_fp16 = mul(x = var_1933_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_63_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_31_promoted_to_fp16)[name = string("x_63_cast_fp16")];
+            int32 var_1948 = const()[name = string("op_1948"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1950_cast_fp16 = mul(x = x_63_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_1950_cast_fp16")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_97_cast_fp16 = concat(axis = var_1948, interleave = input_97_interleave_0, values = (x_63_cast_fp16, var_1950_cast_fp16))[name = string("input_97_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1945_to_fp16 = const()[name = string("op_1945_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_1945_to_fp16, x = input_97_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_1955_split_sizes_0 = const()[name = string("op_1955_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1955_axis_0 = const()[name = string("op_1955_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1955_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1955_cast_fp16_1 = split(axis = var_1955_axis_0, split_sizes = var_1955_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_1955_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400488064)))];
+            tensor<fp16, [1, 1, 2560]> h_25_cast_fp16 = mul(x = var_1955_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_1961 = const()[name = string("op_1961"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1964_axes_0 = const()[name = string("op_1964_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1962_cast_fp16 = transpose(perm = var_1961, x = h_25_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1964_cast_fp16 = expand_dims(axes = var_1964_axes_0, x = var_1962_cast_fp16)[name = string("op_1964_cast_fp16")];
+            string var_1980_pad_type_0 = const()[name = string("op_1980_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1980_strides_0 = const()[name = string("op_1980_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1980_pad_0 = const()[name = string("op_1980_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1980_dilations_0 = const()[name = string("op_1980_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1980_groups_0 = const()[name = string("op_1980_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1980 = conv(dilations = var_1980_dilations_0, groups = var_1980_groups_0, pad = var_1980_pad_0, pad_type = var_1980_pad_type_0, strides = var_1980_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_1964_cast_fp16)[name = string("op_1980")];
+            tensor<int32, [4]> var_1985 = const()[name = string("op_1985"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1986 = reshape(shape = var_1985, x = var_1980)[name = string("op_1986")];
+            tensor<int32, [4]> var_1991 = const()[name = string("op_1991"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2001 = const()[name = string("op_2001"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1992 = transpose(perm = var_1991, x = var_1986)[name = string("transpose_50")];
+            tensor<fp16, [1, 8, 256]> x_65 = reshape(shape = var_2001, x = var_1992)[name = string("x_65")];
+            int32 var_2007 = const()[name = string("op_2007"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2009 = mul(x = x_65, y = const_33_promoted)[name = string("op_2009")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_101 = concat(axis = var_2007, interleave = input_101_interleave_0, values = (x_65, var_2009))[name = string("input_101")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2004_to_fp16 = const()[name = string("op_2004_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2004_to_fp16, x = input_101)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2014_split_sizes_0 = const()[name = string("op_2014_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2014_axis_0 = const()[name = string("op_2014_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2014_0, tensor<fp16, [1, 8, 256]> var_2014_1 = split(axis = var_2014_axis_0, split_sizes = var_2014_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2014")];
+            tensor<fp16, [1, 8, 256]> var_2016 = mul(x = var_2014_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2016")];
+            tensor<int32, [4]> var_2021 = const()[name = string("op_2021"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_27 = reshape(shape = var_2021, x = var_2016)[name = string("q_27")];
+            tensor<fp16, [1, 8, 1, 256]> var_2023_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_2023_cast_fp16")];
+            tensor<int32, [2]> var_2024_split_sizes_0 = const()[name = string("op_2024_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2024_axis_0 = const()[name = string("op_2024_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2024_0, tensor<fp16, [1, 8, 1, 128]> var_2024_1 = split(axis = var_2024_axis_0, split_sizes = var_2024_split_sizes_0, x = q_27)[name = string("op_2024")];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2026 = mul(x = var_2024_1, y = const_34_promoted)[name = string("op_2026")];
+            int32 var_2028 = const()[name = string("op_2028"), val = int32(-1)];
+            bool var_2029_interleave_0 = const()[name = string("op_2029_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2029 = concat(axis = var_2028, interleave = var_2029_interleave_0, values = (var_2026, var_2024_0))[name = string("op_2029")];
+            tensor<fp16, [1, 8, 1, 256]> var_2030_cast_fp16 = mul(x = var_2029, y = sin_s)[name = string("op_2030_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_29_cast_fp16 = add(x = var_2023_cast_fp16, y = var_2030_cast_fp16)[name = string("q_29_cast_fp16")];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_29_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_67_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 1, 512]> var_2062 = sub(x = x_67_cast_fp16, y = reduce_max_4)[name = string("op_2062")];
+            tensor<fp16, [1, 8, 1, 512]> var_2068 = exp(x = var_2062)[name = string("op_2068")];
+            tensor<int32, [1]> var_2078_axes_0 = const()[name = string("op_2078_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2078_keep_dims_0 = const()[name = string("op_2078_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2078 = reduce_sum(axes = var_2078_axes_0, keep_dims = var_2078_keep_dims_0, x = var_2068)[name = string("op_2078")];
+            tensor<fp16, [1, 8, 1, 512]> var_2084_cast_fp16 = real_div(x = var_2068, y = var_2078)[name = string("op_2084_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_2084_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_2095 = const()[name = string("op_2095"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2102 = const()[name = string("op_2102"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2096_cast_fp16 = transpose(perm = var_2095, x = attn_output_25_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 1, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_2102, x = var_2096_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_2107 = const()[name = string("op_2107"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2123_pad_type_0 = const()[name = string("op_2123_pad_type_0"), val = string("valid")];
+            int32 var_2123_groups_0 = const()[name = string("op_2123_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2123_strides_0 = const()[name = string("op_2123_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2123_pad_0 = const()[name = string("op_2123_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2123_dilations_0 = const()[name = string("op_2123_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400493248))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403114752))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2108_cast_fp16 = transpose(perm = var_2107, x = attn_output_27_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [1, 2560, 1]> var_2123_cast_fp16 = conv(dilations = var_2123_dilations_0, groups = var_2123_groups_0, pad = var_2123_pad_0, pad_type = var_2123_pad_type_0, strides = var_2123_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_2108_cast_fp16)[name = string("op_2123_cast_fp16")];
+            tensor<int32, [3]> var_2127 = const()[name = string("op_2127"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2133 = const()[name = string("op_2133"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_71_cast_fp16 = transpose(perm = var_2127, x = var_2123_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 1, 2560]> var_2135_cast_fp16 = mul(x = x_71_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2135_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_105_cast_fp16 = concat(axis = var_2133, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_2135_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2130_to_fp16 = const()[name = string("op_2130_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2130_to_fp16, x = input_105_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2140_split_sizes_0 = const()[name = string("op_2140_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2140_axis_0 = const()[name = string("op_2140_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2140_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2140_cast_fp16_1 = split(axis = var_2140_axis_0, split_sizes = var_2140_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2140_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403117376)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_29_cast_fp16 = mul(x = var_2140_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_73_cast_fp16 = add(x = x_63_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2149 = const()[name = string("op_2149"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2151_cast_fp16 = mul(x = x_73_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2151_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_107_cast_fp16 = concat(axis = var_2149, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_2151_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2146_to_fp16 = const()[name = string("op_2146_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2146_to_fp16, x = input_107_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2156_split_sizes_0 = const()[name = string("op_2156_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2156_axis_0 = const()[name = string("op_2156_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2156_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2156_cast_fp16_1 = split(axis = var_2156_axis_0, split_sizes = var_2156_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2156_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403122560)))];
+            tensor<fp16, [1, 1, 2560]> h_27_cast_fp16 = mul(x = var_2156_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_2167 = const()[name = string("op_2167"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2168 = transpose(perm = var_2167, x = h_27_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 2560, 1, 1]> input_109 = expand_dims(axes = input_109_axes_0, x = var_2168)[name = string("input_109")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 1]> input_111 = mul(x = gate_19, y = up_9)[name = string("input_111")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_2208_axes_0 = const()[name = string("op_2208_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2208 = squeeze(axes = var_2208_axes_0, x = mlp_out_9)[name = string("op_2208")];
+            tensor<int32, [3]> var_2212 = const()[name = string("op_2212"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2218 = const()[name = string("op_2218"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_75 = transpose(perm = var_2212, x = var_2208)[name = string("transpose_45")];
+            tensor<fp16, [1, 1, 2560]> var_2220 = mul(x = x_75, y = const_37_promoted)[name = string("op_2220")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_113 = concat(axis = var_2218, interleave = input_113_interleave_0, values = (x_75, var_2220))[name = string("input_113")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2215_to_fp16 = const()[name = string("op_2215_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_2215_to_fp16, x = input_113)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_2225_split_sizes_0 = const()[name = string("op_2225_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2225_axis_0 = const()[name = string("op_2225_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2225_0, tensor<fp16, [1, 1, 2560]> var_2225_1 = split(axis = var_2225_axis_0, split_sizes = var_2225_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_2225")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_43 = mul(x = var_2225_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_45_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 9472])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 1, 9728])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_2253 = const()[name = string("op_2253"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2254 = transpose(perm = var_2253, x = hidden_states_45_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 2560, 1, 1]> input_115 = expand_dims(axes = input_115_axes_0, x = var_2254)[name = string("input_115")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_2273 = const()[name = string("op_2273"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2274_cast_fp16 = transpose(perm = var_2273, x = per_layer_slice_9_cast_fp16)[name = string("transpose_43")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_2274_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_117_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403127744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403455488))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_2290_axes_0 = const()[name = string("op_2290_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2290_cast_fp16 = squeeze(axes = var_2290_axes_0, x = gated_29_cast_fp16)[name = string("op_2290_cast_fp16")];
+            tensor<int32, [3]> var_2294 = const()[name = string("op_2294"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2300 = const()[name = string("op_2300"), val = int32(-1)];
+            fp16 const_38_promoted_to_fp16 = const()[name = string("const_38_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_77_cast_fp16 = transpose(perm = var_2294, x = var_2290_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 1, 2560]> var_2302_cast_fp16 = mul(x = x_77_cast_fp16, y = const_38_promoted_to_fp16)[name = string("op_2302_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_119_cast_fp16 = concat(axis = var_2300, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_2302_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2297_to_fp16 = const()[name = string("op_2297_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_2297_to_fp16, x = input_119_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_2307_split_sizes_0 = const()[name = string("op_2307_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2307_axis_0 = const()[name = string("op_2307_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2307_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2307_cast_fp16_1 = split(axis = var_2307_axis_0, split_sizes = var_2307_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_2307_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403458112)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_49_cast_fp16 = mul(x = var_2307_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_39_promoted_to_fp16 = const()[name = string("const_39_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_79_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_39_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_2322 = const()[name = string("op_2322"), val = int32(-1)];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2324_cast_fp16 = mul(x = x_79_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_2324_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_121_cast_fp16 = concat(axis = var_2322, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_2324_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2319_to_fp16 = const()[name = string("op_2319_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_2319_to_fp16, x = input_121_cast_fp16)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_2329_split_sizes_0 = const()[name = string("op_2329_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2329_axis_0 = const()[name = string("op_2329_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2329_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2329_cast_fp16_1 = split(axis = var_2329_axis_0, split_sizes = var_2329_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_2329_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403463296)))];
+            tensor<fp16, [1, 1, 2560]> h_31_cast_fp16 = mul(x = var_2329_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_2335 = const()[name = string("op_2335"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2338_axes_0 = const()[name = string("op_2338_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2336_cast_fp16 = transpose(perm = var_2335, x = h_31_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2338_cast_fp16 = expand_dims(axes = var_2338_axes_0, x = var_2336_cast_fp16)[name = string("op_2338_cast_fp16")];
+            string var_2354_pad_type_0 = const()[name = string("op_2354_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2354_strides_0 = const()[name = string("op_2354_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2354_pad_0 = const()[name = string("op_2354_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2354_dilations_0 = const()[name = string("op_2354_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2354_groups_0 = const()[name = string("op_2354_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2354 = conv(dilations = var_2354_dilations_0, groups = var_2354_groups_0, pad = var_2354_pad_0, pad_type = var_2354_pad_type_0, strides = var_2354_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_2338_cast_fp16)[name = string("op_2354")];
+            tensor<int32, [4]> var_2359 = const()[name = string("op_2359"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2360 = reshape(shape = var_2359, x = var_2354)[name = string("op_2360")];
+            tensor<int32, [4]> var_2365 = const()[name = string("op_2365"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2375 = const()[name = string("op_2375"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2366 = transpose(perm = var_2365, x = var_2360)[name = string("transpose_40")];
+            tensor<fp16, [1, 8, 256]> x_81 = reshape(shape = var_2375, x = var_2366)[name = string("x_81")];
+            int32 var_2381 = const()[name = string("op_2381"), val = int32(-1)];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2383 = mul(x = x_81, y = const_41_promoted)[name = string("op_2383")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_125 = concat(axis = var_2381, interleave = input_125_interleave_0, values = (x_81, var_2383))[name = string("input_125")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2378_to_fp16 = const()[name = string("op_2378_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_2378_to_fp16, x = input_125)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_2388_split_sizes_0 = const()[name = string("op_2388_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2388_axis_0 = const()[name = string("op_2388_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2388_0, tensor<fp16, [1, 8, 256]> var_2388_1 = split(axis = var_2388_axis_0, split_sizes = var_2388_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_2388")];
+            tensor<fp16, [1, 8, 256]> var_2390 = mul(x = var_2388_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2390")];
+            tensor<int32, [4]> var_2395 = const()[name = string("op_2395"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_33 = reshape(shape = var_2395, x = var_2390)[name = string("q_33")];
+            tensor<fp16, [1, 8, 1, 256]> var_2397_cast_fp16 = mul(x = q_33, y = cos_s)[name = string("op_2397_cast_fp16")];
+            tensor<int32, [2]> var_2398_split_sizes_0 = const()[name = string("op_2398_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2398_axis_0 = const()[name = string("op_2398_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2398_0, tensor<fp16, [1, 8, 1, 128]> var_2398_1 = split(axis = var_2398_axis_0, split_sizes = var_2398_split_sizes_0, x = q_33)[name = string("op_2398")];
+            fp16 const_42_promoted = const()[name = string("const_42_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2400 = mul(x = var_2398_1, y = const_42_promoted)[name = string("op_2400")];
+            int32 var_2402 = const()[name = string("op_2402"), val = int32(-1)];
+            bool var_2403_interleave_0 = const()[name = string("op_2403_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2403 = concat(axis = var_2402, interleave = var_2403_interleave_0, values = (var_2400, var_2398_0))[name = string("op_2403")];
+            tensor<fp16, [1, 8, 1, 256]> var_2404_cast_fp16 = mul(x = var_2403, y = sin_s)[name = string("op_2404_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_35_cast_fp16 = add(x = var_2397_cast_fp16, y = var_2404_cast_fp16)[name = string("q_35_cast_fp16")];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_35_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_83_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_sliding)[name = string("x_83_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_83_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 1, 512]> var_2436 = sub(x = x_83_cast_fp16, y = reduce_max_5)[name = string("op_2436")];
+            tensor<fp16, [1, 8, 1, 512]> var_2442 = exp(x = var_2436)[name = string("op_2442")];
+            tensor<int32, [1]> var_2452_axes_0 = const()[name = string("op_2452_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2452_keep_dims_0 = const()[name = string("op_2452_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2452 = reduce_sum(axes = var_2452_axes_0, keep_dims = var_2452_keep_dims_0, x = var_2442)[name = string("op_2452")];
+            tensor<fp16, [1, 8, 1, 512]> var_2458_cast_fp16 = real_div(x = var_2442, y = var_2452)[name = string("op_2458_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_2458_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_2469 = const()[name = string("op_2469"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2476 = const()[name = string("op_2476"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2470_cast_fp16 = transpose(perm = var_2469, x = attn_output_31_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 1, 2048]> attn_output_33_cast_fp16 = reshape(shape = var_2476, x = var_2470_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_2481 = const()[name = string("op_2481"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2497_pad_type_0 = const()[name = string("op_2497_pad_type_0"), val = string("valid")];
+            int32 var_2497_groups_0 = const()[name = string("op_2497_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2497_strides_0 = const()[name = string("op_2497_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2497_pad_0 = const()[name = string("op_2497_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2497_dilations_0 = const()[name = string("op_2497_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403468480))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406089984))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2482_cast_fp16 = transpose(perm = var_2481, x = attn_output_33_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 2560, 1]> var_2497_cast_fp16 = conv(dilations = var_2497_dilations_0, groups = var_2497_groups_0, pad = var_2497_pad_0, pad_type = var_2497_pad_type_0, strides = var_2497_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_2482_cast_fp16)[name = string("op_2497_cast_fp16")];
+            tensor<int32, [3]> var_2501 = const()[name = string("op_2501"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2507 = const()[name = string("op_2507"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_87_cast_fp16 = transpose(perm = var_2501, x = var_2497_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 1, 2560]> var_2509_cast_fp16 = mul(x = x_87_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_2509_cast_fp16")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_129_cast_fp16 = concat(axis = var_2507, interleave = input_129_interleave_0, values = (x_87_cast_fp16, var_2509_cast_fp16))[name = string("input_129_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2504_to_fp16 = const()[name = string("op_2504_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_2504_to_fp16, x = input_129_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_2514_split_sizes_0 = const()[name = string("op_2514_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2514_axis_0 = const()[name = string("op_2514_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2514_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2514_cast_fp16_1 = split(axis = var_2514_axis_0, split_sizes = var_2514_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_2514_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406092608)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_35_cast_fp16 = mul(x = var_2514_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_89_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_89_cast_fp16")];
+            int32 var_2523 = const()[name = string("op_2523"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2525_cast_fp16 = mul(x = x_89_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2525_cast_fp16")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_131_cast_fp16 = concat(axis = var_2523, interleave = input_131_interleave_0, values = (x_89_cast_fp16, var_2525_cast_fp16))[name = string("input_131_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2520_to_fp16 = const()[name = string("op_2520_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_2520_to_fp16, x = input_131_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_2530_split_sizes_0 = const()[name = string("op_2530_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2530_axis_0 = const()[name = string("op_2530_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2530_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2530_cast_fp16_1 = split(axis = var_2530_axis_0, split_sizes = var_2530_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_2530_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406097792)))];
+            tensor<fp16, [1, 1, 2560]> h_33_cast_fp16 = mul(x = var_2530_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_2541 = const()[name = string("op_2541"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_133_axes_0 = const()[name = string("input_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2542 = transpose(perm = var_2541, x = h_33_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 2560, 1, 1]> input_133 = expand_dims(axes = input_133_axes_0, x = var_2542)[name = string("input_133")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_133)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_133)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 1]> input_135 = mul(x = gate_23, y = up_11)[name = string("input_135")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_135)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_2582_axes_0 = const()[name = string("op_2582_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2582 = squeeze(axes = var_2582_axes_0, x = mlp_out_11)[name = string("op_2582")];
+            tensor<int32, [3]> var_2586 = const()[name = string("op_2586"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2592 = const()[name = string("op_2592"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_91 = transpose(perm = var_2586, x = var_2582)[name = string("transpose_35")];
+            tensor<fp16, [1, 1, 2560]> var_2594 = mul(x = x_91, y = const_45_promoted)[name = string("op_2594")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_137 = concat(axis = var_2592, interleave = input_137_interleave_0, values = (x_91, var_2594))[name = string("input_137")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2589_to_fp16 = const()[name = string("op_2589_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_2589_to_fp16, x = input_137)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_2599_split_sizes_0 = const()[name = string("op_2599_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2599_axis_0 = const()[name = string("op_2599_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2599_0, tensor<fp16, [1, 1, 2560]> var_2599_1 = split(axis = var_2599_axis_0, split_sizes = var_2599_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_2599")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_53 = mul(x = var_2599_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_55_cast_fp16 = add(x = x_89_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 9728])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 1, 9984])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_2627 = const()[name = string("op_2627"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2628 = transpose(perm = var_2627, x = hidden_states_55_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 2560, 1, 1]> input_139 = expand_dims(axes = input_139_axes_0, x = var_2628)[name = string("input_139")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_139)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_2647 = const()[name = string("op_2647"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2648_cast_fp16 = transpose(perm = var_2647, x = per_layer_slice_11_cast_fp16)[name = string("transpose_33")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_2648_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_141_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_141_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406102976))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406430720))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_141_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_2664_axes_0 = const()[name = string("op_2664_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2664_cast_fp16 = squeeze(axes = var_2664_axes_0, x = gated_35_cast_fp16)[name = string("op_2664_cast_fp16")];
+            tensor<int32, [3]> var_2668 = const()[name = string("op_2668"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2674 = const()[name = string("op_2674"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_93_cast_fp16 = transpose(perm = var_2668, x = var_2664_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [1, 1, 2560]> var_2676_cast_fp16 = mul(x = x_93_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_2676_cast_fp16")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_143_cast_fp16 = concat(axis = var_2674, interleave = input_143_interleave_0, values = (x_93_cast_fp16, var_2676_cast_fp16))[name = string("input_143_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2671_to_fp16 = const()[name = string("op_2671_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_2671_to_fp16, x = input_143_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_2681_split_sizes_0 = const()[name = string("op_2681_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2681_axis_0 = const()[name = string("op_2681_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2681_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2681_cast_fp16_1 = split(axis = var_2681_axis_0, split_sizes = var_2681_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_2681_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406433344)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_59_cast_fp16 = mul(x = var_2681_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c4p-1])];
+            tensor<fp16, [1, 1, 2560]> x_95_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_95_cast_fp16")];
+            int32 var_2696 = const()[name = string("op_2696"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2698_cast_fp16 = mul(x = x_95_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_2698_cast_fp16")];
+            bool input_145_interleave_0 = const()[name = string("input_145_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_145_cast_fp16 = concat(axis = var_2696, interleave = input_145_interleave_0, values = (x_95_cast_fp16, var_2698_cast_fp16))[name = string("input_145_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2693_to_fp16 = const()[name = string("op_2693_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_2693_to_fp16, x = input_145_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_2703_split_sizes_0 = const()[name = string("op_2703_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2703_axis_0 = const()[name = string("op_2703_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2703_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2703_cast_fp16_1 = split(axis = var_2703_axis_0, split_sizes = var_2703_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_2703_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406438528)))];
+            tensor<fp16, [1, 1, 2560]> h_37_cast_fp16 = mul(x = var_2703_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_2709 = const()[name = string("op_2709"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2712_axes_0 = const()[name = string("op_2712_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2710_cast_fp16 = transpose(perm = var_2709, x = h_37_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2712_cast_fp16 = expand_dims(axes = var_2712_axes_0, x = var_2710_cast_fp16)[name = string("op_2712_cast_fp16")];
+            string var_2728_pad_type_0 = const()[name = string("op_2728_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2728_strides_0 = const()[name = string("op_2728_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2728_pad_0 = const()[name = string("op_2728_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2728_dilations_0 = const()[name = string("op_2728_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2728_groups_0 = const()[name = string("op_2728_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2728 = conv(dilations = var_2728_dilations_0, groups = var_2728_groups_0, pad = var_2728_pad_0, pad_type = var_2728_pad_type_0, strides = var_2728_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_2712_cast_fp16)[name = string("op_2728")];
+            tensor<int32, [4]> var_2733 = const()[name = string("op_2733"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2734 = reshape(shape = var_2733, x = var_2728)[name = string("op_2734")];
+            tensor<int32, [4]> var_2739 = const()[name = string("op_2739"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2749 = const()[name = string("op_2749"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2740 = transpose(perm = var_2739, x = var_2734)[name = string("transpose_30")];
+            tensor<fp16, [1, 8, 256]> x_97 = reshape(shape = var_2749, x = var_2740)[name = string("x_97")];
+            int32 var_2755 = const()[name = string("op_2755"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2757 = mul(x = x_97, y = const_49_promoted)[name = string("op_2757")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_149 = concat(axis = var_2755, interleave = input_149_interleave_0, values = (x_97, var_2757))[name = string("input_149")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2752_to_fp16 = const()[name = string("op_2752_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_2752_to_fp16, x = input_149)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_2762_split_sizes_0 = const()[name = string("op_2762_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2762_axis_0 = const()[name = string("op_2762_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2762_0, tensor<fp16, [1, 8, 256]> var_2762_1 = split(axis = var_2762_axis_0, split_sizes = var_2762_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_2762")];
+            tensor<fp16, [1, 8, 256]> var_2764 = mul(x = var_2762_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2764")];
+            tensor<int32, [4]> var_2769 = const()[name = string("op_2769"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_39 = reshape(shape = var_2769, x = var_2764)[name = string("q_39")];
+            tensor<fp16, [1, 8, 1, 256]> var_2771_cast_fp16 = mul(x = q_39, y = cos_s)[name = string("op_2771_cast_fp16")];
+            tensor<int32, [2]> var_2772_split_sizes_0 = const()[name = string("op_2772_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2772_axis_0 = const()[name = string("op_2772_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2772_0, tensor<fp16, [1, 8, 1, 128]> var_2772_1 = split(axis = var_2772_axis_0, split_sizes = var_2772_split_sizes_0, x = q_39)[name = string("op_2772")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2774 = mul(x = var_2772_1, y = const_50_promoted)[name = string("op_2774")];
+            int32 var_2776 = const()[name = string("op_2776"), val = int32(-1)];
+            bool var_2777_interleave_0 = const()[name = string("op_2777_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2777 = concat(axis = var_2776, interleave = var_2777_interleave_0, values = (var_2774, var_2772_0))[name = string("op_2777")];
+            tensor<fp16, [1, 8, 1, 256]> var_2778_cast_fp16 = mul(x = var_2777, y = sin_s)[name = string("op_2778_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_41_cast_fp16 = add(x = var_2771_cast_fp16, y = var_2778_cast_fp16)[name = string("q_41_cast_fp16")];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_41_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_99_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_99_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 1, 512]> var_2810 = sub(x = x_99_cast_fp16, y = reduce_max_6)[name = string("op_2810")];
+            tensor<fp16, [1, 8, 1, 512]> var_2816 = exp(x = var_2810)[name = string("op_2816")];
+            tensor<int32, [1]> var_2826_axes_0 = const()[name = string("op_2826_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2826_keep_dims_0 = const()[name = string("op_2826_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2826 = reduce_sum(axes = var_2826_axes_0, keep_dims = var_2826_keep_dims_0, x = var_2816)[name = string("op_2826")];
+            tensor<fp16, [1, 8, 1, 512]> var_2832_cast_fp16 = real_div(x = var_2816, y = var_2826)[name = string("op_2832_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_2832_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_2843 = const()[name = string("op_2843"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2850 = const()[name = string("op_2850"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2844_cast_fp16 = transpose(perm = var_2843, x = attn_output_37_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 1, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_2850, x = var_2844_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_2855 = const()[name = string("op_2855"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2871_pad_type_0 = const()[name = string("op_2871_pad_type_0"), val = string("valid")];
+            int32 var_2871_groups_0 = const()[name = string("op_2871_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2871_strides_0 = const()[name = string("op_2871_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2871_pad_0 = const()[name = string("op_2871_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2871_dilations_0 = const()[name = string("op_2871_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406443712))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409065216))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2856_cast_fp16 = transpose(perm = var_2855, x = attn_output_39_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 2560, 1]> var_2871_cast_fp16 = conv(dilations = var_2871_dilations_0, groups = var_2871_groups_0, pad = var_2871_pad_0, pad_type = var_2871_pad_type_0, strides = var_2871_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_2856_cast_fp16)[name = string("op_2871_cast_fp16")];
+            tensor<int32, [3]> var_2875 = const()[name = string("op_2875"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2881 = const()[name = string("op_2881"), val = int32(-1)];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_103_cast_fp16 = transpose(perm = var_2875, x = var_2871_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 1, 2560]> var_2883_cast_fp16 = mul(x = x_103_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_2883_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_153_cast_fp16 = concat(axis = var_2881, interleave = input_153_interleave_0, values = (x_103_cast_fp16, var_2883_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2878_to_fp16 = const()[name = string("op_2878_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_2878_to_fp16, x = input_153_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_2888_split_sizes_0 = const()[name = string("op_2888_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2888_axis_0 = const()[name = string("op_2888_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2888_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2888_cast_fp16_1 = split(axis = var_2888_axis_0, split_sizes = var_2888_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_2888_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409067840)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_41_cast_fp16 = mul(x = var_2888_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_105_cast_fp16 = add(x = x_95_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_105_cast_fp16")];
+            int32 var_2897 = const()[name = string("op_2897"), val = int32(-1)];
+            fp16 const_52_promoted_to_fp16 = const()[name = string("const_52_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2899_cast_fp16 = mul(x = x_105_cast_fp16, y = const_52_promoted_to_fp16)[name = string("op_2899_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_155_cast_fp16 = concat(axis = var_2897, interleave = input_155_interleave_0, values = (x_105_cast_fp16, var_2899_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2894_to_fp16 = const()[name = string("op_2894_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_2894_to_fp16, x = input_155_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_2904_split_sizes_0 = const()[name = string("op_2904_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2904_axis_0 = const()[name = string("op_2904_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2904_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2904_cast_fp16_1 = split(axis = var_2904_axis_0, split_sizes = var_2904_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_2904_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409073024)))];
+            tensor<fp16, [1, 1, 2560]> h_39_cast_fp16 = mul(x = var_2904_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_2915 = const()[name = string("op_2915"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_157_axes_0 = const()[name = string("input_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2916 = transpose(perm = var_2915, x = h_39_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 1, 1]> input_157 = expand_dims(axes = input_157_axes_0, x = var_2916)[name = string("input_157")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_157)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_157)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 1]> input_159 = mul(x = gate_27, y = up_13)[name = string("input_159")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_159)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_2956_axes_0 = const()[name = string("op_2956_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2956 = squeeze(axes = var_2956_axes_0, x = mlp_out_13)[name = string("op_2956")];
+            tensor<int32, [3]> var_2960 = const()[name = string("op_2960"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2966 = const()[name = string("op_2966"), val = int32(-1)];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_107 = transpose(perm = var_2960, x = var_2956)[name = string("transpose_25")];
+            tensor<fp16, [1, 1, 2560]> var_2968 = mul(x = x_107, y = const_53_promoted)[name = string("op_2968")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_161 = concat(axis = var_2966, interleave = input_161_interleave_0, values = (x_107, var_2968))[name = string("input_161")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2963_to_fp16 = const()[name = string("op_2963_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_2963_to_fp16, x = input_161)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_2973_split_sizes_0 = const()[name = string("op_2973_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2973_axis_0 = const()[name = string("op_2973_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2973_0, tensor<fp16, [1, 1, 2560]> var_2973_1 = split(axis = var_2973_axis_0, split_sizes = var_2973_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_2973")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_63 = mul(x = var_2973_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_65_cast_fp16 = add(x = x_105_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 9984])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 1, 10240])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_3001 = const()[name = string("op_3001"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_163_axes_0 = const()[name = string("input_163_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3002 = transpose(perm = var_3001, x = hidden_states_65_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 1]> input_163 = expand_dims(axes = input_163_axes_0, x = var_3002)[name = string("input_163")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_163)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_3021 = const()[name = string("op_3021"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3022_cast_fp16 = transpose(perm = var_3021, x = per_layer_slice_13_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_3022_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_165_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_165_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409078208))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409405952))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_165_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_3038_axes_0 = const()[name = string("op_3038_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3038_cast_fp16 = squeeze(axes = var_3038_axes_0, x = gated_41_cast_fp16)[name = string("op_3038_cast_fp16")];
+            tensor<int32, [3]> var_3042 = const()[name = string("op_3042"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3048 = const()[name = string("op_3048"), val = int32(-1)];
+            fp16 const_54_promoted_to_fp16 = const()[name = string("const_54_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_109_cast_fp16 = transpose(perm = var_3042, x = var_3038_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 1, 2560]> var_3050_cast_fp16 = mul(x = x_109_cast_fp16, y = const_54_promoted_to_fp16)[name = string("op_3050_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_167_cast_fp16 = concat(axis = var_3048, interleave = input_167_interleave_0, values = (x_109_cast_fp16, var_3050_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3045_to_fp16 = const()[name = string("op_3045_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_3045_to_fp16, x = input_167_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_3055_split_sizes_0 = const()[name = string("op_3055_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3055_axis_0 = const()[name = string("op_3055_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3055_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3055_cast_fp16_1 = split(axis = var_3055_axis_0, split_sizes = var_3055_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_3055_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409408576)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_69_cast_fp16 = mul(x = var_3055_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_111_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_55_promoted_to_fp16)[name = string("x_111_cast_fp16")];
+            int32 var_3070 = const()[name = string("op_3070"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3072_cast_fp16 = mul(x = x_111_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3072_cast_fp16")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_169_cast_fp16 = concat(axis = var_3070, interleave = input_169_interleave_0, values = (x_111_cast_fp16, var_3072_cast_fp16))[name = string("input_169_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3067_to_fp16 = const()[name = string("op_3067_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_3067_to_fp16, x = input_169_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_3077_split_sizes_0 = const()[name = string("op_3077_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3077_axis_0 = const()[name = string("op_3077_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3077_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3077_cast_fp16_1 = split(axis = var_3077_axis_0, split_sizes = var_3077_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_3077_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409413760)))];
+            tensor<fp16, [1, 1, 2560]> h_43_cast_fp16 = mul(x = var_3077_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_3083 = const()[name = string("op_3083"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3086_axes_0 = const()[name = string("op_3086_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3084_cast_fp16 = transpose(perm = var_3083, x = h_43_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3086_cast_fp16 = expand_dims(axes = var_3086_axes_0, x = var_3084_cast_fp16)[name = string("op_3086_cast_fp16")];
+            string var_3102_pad_type_0 = const()[name = string("op_3102_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3102_strides_0 = const()[name = string("op_3102_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3102_pad_0 = const()[name = string("op_3102_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3102_dilations_0 = const()[name = string("op_3102_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3102_groups_0 = const()[name = string("op_3102_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3102 = conv(dilations = var_3102_dilations_0, groups = var_3102_groups_0, pad = var_3102_pad_0, pad_type = var_3102_pad_type_0, strides = var_3102_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_3086_cast_fp16)[name = string("op_3102")];
+            tensor<int32, [4]> var_3107 = const()[name = string("op_3107"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3108 = reshape(shape = var_3107, x = var_3102)[name = string("op_3108")];
+            tensor<int32, [4]> var_3113 = const()[name = string("op_3113"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3123 = const()[name = string("op_3123"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3114 = transpose(perm = var_3113, x = var_3108)[name = string("transpose_20")];
+            tensor<fp16, [1, 8, 256]> x_113 = reshape(shape = var_3123, x = var_3114)[name = string("x_113")];
+            int32 var_3129 = const()[name = string("op_3129"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3131 = mul(x = x_113, y = const_57_promoted)[name = string("op_3131")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_173 = concat(axis = var_3129, interleave = input_173_interleave_0, values = (x_113, var_3131))[name = string("input_173")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3126_to_fp16 = const()[name = string("op_3126_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_3126_to_fp16, x = input_173)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_3136_split_sizes_0 = const()[name = string("op_3136_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3136_axis_0 = const()[name = string("op_3136_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3136_0, tensor<fp16, [1, 8, 256]> var_3136_1 = split(axis = var_3136_axis_0, split_sizes = var_3136_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_3136")];
+            tensor<fp16, [1, 8, 256]> var_3138 = mul(x = var_3136_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_3138")];
+            tensor<int32, [4]> var_3143 = const()[name = string("op_3143"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_45 = reshape(shape = var_3143, x = var_3138)[name = string("q_45")];
+            tensor<fp16, [1, 8, 1, 256]> var_3145_cast_fp16 = mul(x = q_45, y = cos_s)[name = string("op_3145_cast_fp16")];
+            tensor<int32, [2]> var_3146_split_sizes_0 = const()[name = string("op_3146_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3146_axis_0 = const()[name = string("op_3146_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3146_0, tensor<fp16, [1, 8, 1, 128]> var_3146_1 = split(axis = var_3146_axis_0, split_sizes = var_3146_split_sizes_0, x = q_45)[name = string("op_3146")];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3148 = mul(x = var_3146_1, y = const_58_promoted)[name = string("op_3148")];
+            int32 var_3150 = const()[name = string("op_3150"), val = int32(-1)];
+            bool var_3151_interleave_0 = const()[name = string("op_3151_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3151 = concat(axis = var_3150, interleave = var_3151_interleave_0, values = (var_3148, var_3146_0))[name = string("op_3151")];
+            tensor<fp16, [1, 8, 1, 256]> var_3152_cast_fp16 = mul(x = var_3151, y = sin_s)[name = string("op_3152_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_47_cast_fp16 = add(x = var_3145_cast_fp16, y = var_3152_cast_fp16)[name = string("q_47_cast_fp16")];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_47_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_115_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_115_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_115_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 1, 512]> var_3184 = sub(x = x_115_cast_fp16, y = reduce_max_7)[name = string("op_3184")];
+            tensor<fp16, [1, 8, 1, 512]> var_3190 = exp(x = var_3184)[name = string("op_3190")];
+            tensor<int32, [1]> var_3200_axes_0 = const()[name = string("op_3200_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3200_keep_dims_0 = const()[name = string("op_3200_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3200 = reduce_sum(axes = var_3200_axes_0, keep_dims = var_3200_keep_dims_0, x = var_3190)[name = string("op_3200")];
+            tensor<fp16, [1, 8, 1, 512]> var_3206_cast_fp16 = real_div(x = var_3190, y = var_3200)[name = string("op_3206_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_3206_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_3217 = const()[name = string("op_3217"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3224 = const()[name = string("op_3224"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3218_cast_fp16 = transpose(perm = var_3217, x = attn_output_43_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 1, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_3224, x = var_3218_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_3229 = const()[name = string("op_3229"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3245_pad_type_0 = const()[name = string("op_3245_pad_type_0"), val = string("valid")];
+            int32 var_3245_groups_0 = const()[name = string("op_3245_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3245_strides_0 = const()[name = string("op_3245_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3245_pad_0 = const()[name = string("op_3245_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3245_dilations_0 = const()[name = string("op_3245_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409418944))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412040448))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3230_cast_fp16 = transpose(perm = var_3229, x = attn_output_45_cast_fp16)[name = string("transpose_18")];
+            tensor<fp16, [1, 2560, 1]> var_3245_cast_fp16 = conv(dilations = var_3245_dilations_0, groups = var_3245_groups_0, pad = var_3245_pad_0, pad_type = var_3245_pad_type_0, strides = var_3245_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_3230_cast_fp16)[name = string("op_3245_cast_fp16")];
+            tensor<int32, [3]> var_3249 = const()[name = string("op_3249"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3255 = const()[name = string("op_3255"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_119_cast_fp16 = transpose(perm = var_3249, x = var_3245_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 1, 2560]> var_3257_cast_fp16 = mul(x = x_119_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3257_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_177_cast_fp16 = concat(axis = var_3255, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_3257_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3252_to_fp16 = const()[name = string("op_3252_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_3252_to_fp16, x = input_177_cast_fp16)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_3262_split_sizes_0 = const()[name = string("op_3262_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3262_axis_0 = const()[name = string("op_3262_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3262_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3262_cast_fp16_1 = split(axis = var_3262_axis_0, split_sizes = var_3262_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_3262_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412043072)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_47_cast_fp16 = mul(x = var_3262_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_121_cast_fp16 = add(x = x_111_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_121_cast_fp16")];
+            int32 var_3271 = const()[name = string("op_3271"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3273_cast_fp16 = mul(x = x_121_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3273_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_179_cast_fp16 = concat(axis = var_3271, interleave = input_179_interleave_0, values = (x_121_cast_fp16, var_3273_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3268_to_fp16 = const()[name = string("op_3268_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_3268_to_fp16, x = input_179_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_3278_split_sizes_0 = const()[name = string("op_3278_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3278_axis_0 = const()[name = string("op_3278_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3278_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3278_cast_fp16_1 = split(axis = var_3278_axis_0, split_sizes = var_3278_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_3278_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412048256)))];
+            tensor<fp16, [1, 1, 2560]> h_45_cast_fp16 = mul(x = var_3278_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_3289 = const()[name = string("op_3289"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_181_axes_0 = const()[name = string("input_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3290 = transpose(perm = var_3289, x = h_45_cast_fp16)[name = string("transpose_16")];
+            tensor<fp16, [1, 2560, 1, 1]> input_181 = expand_dims(axes = input_181_axes_0, x = var_3290)[name = string("input_181")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_181)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_181)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 1]> input_183 = mul(x = gate_31, y = up_15)[name = string("input_183")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_183)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_3330_axes_0 = const()[name = string("op_3330_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3330 = squeeze(axes = var_3330_axes_0, x = mlp_out_15)[name = string("op_3330")];
+            tensor<int32, [3]> var_3334 = const()[name = string("op_3334"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3340 = const()[name = string("op_3340"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_123 = transpose(perm = var_3334, x = var_3330)[name = string("transpose_15")];
+            tensor<fp16, [1, 1, 2560]> var_3342 = mul(x = x_123, y = const_61_promoted)[name = string("op_3342")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_185 = concat(axis = var_3340, interleave = input_185_interleave_0, values = (x_123, var_3342))[name = string("input_185")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3337_to_fp16 = const()[name = string("op_3337_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_3337_to_fp16, x = input_185)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_3347_split_sizes_0 = const()[name = string("op_3347_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3347_axis_0 = const()[name = string("op_3347_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3347_0, tensor<fp16, [1, 1, 2560]> var_3347_1 = split(axis = var_3347_axis_0, split_sizes = var_3347_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_3347")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_73 = mul(x = var_3347_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_75_cast_fp16 = add(x = x_121_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 10240])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 1, 10496])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_3375 = const()[name = string("op_3375"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_187_axes_0 = const()[name = string("input_187_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3376 = transpose(perm = var_3375, x = hidden_states_75_cast_fp16)[name = string("transpose_14")];
+            tensor<fp16, [1, 2560, 1, 1]> input_187 = expand_dims(axes = input_187_axes_0, x = var_3376)[name = string("input_187")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_187)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_3395 = const()[name = string("op_3395"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3396_cast_fp16 = transpose(perm = var_3395, x = per_layer_slice_15_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_3396_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_189_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_189_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412053440))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412381184))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_189_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_3412_axes_0 = const()[name = string("op_3412_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3412_cast_fp16 = squeeze(axes = var_3412_axes_0, x = gated_47_cast_fp16)[name = string("op_3412_cast_fp16")];
+            tensor<int32, [3]> var_3416 = const()[name = string("op_3416"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3422 = const()[name = string("op_3422"), val = int32(-1)];
+            fp16 const_62_promoted_to_fp16 = const()[name = string("const_62_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_125_cast_fp16 = transpose(perm = var_3416, x = var_3412_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [1, 1, 2560]> var_3424_cast_fp16 = mul(x = x_125_cast_fp16, y = const_62_promoted_to_fp16)[name = string("op_3424_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_191_cast_fp16 = concat(axis = var_3422, interleave = input_191_interleave_0, values = (x_125_cast_fp16, var_3424_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3419_to_fp16 = const()[name = string("op_3419_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_3419_to_fp16, x = input_191_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_3429_split_sizes_0 = const()[name = string("op_3429_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3429_axis_0 = const()[name = string("op_3429_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3429_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3429_cast_fp16_1 = split(axis = var_3429_axis_0, split_sizes = var_3429_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_3429_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412383808)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_79_cast_fp16 = mul(x = var_3429_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.9ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_127_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_63_promoted_to_fp16)[name = string("x_127_cast_fp16")];
+            int32 var_3444 = const()[name = string("op_3444"), val = int32(-1)];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3446_cast_fp16 = mul(x = x_127_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_3446_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_193_cast_fp16 = concat(axis = var_3444, interleave = input_193_interleave_0, values = (x_127_cast_fp16, var_3446_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3441_to_fp16 = const()[name = string("op_3441_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_3441_to_fp16, x = input_193_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_3451_split_sizes_0 = const()[name = string("op_3451_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3451_axis_0 = const()[name = string("op_3451_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3451_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3451_cast_fp16_1 = split(axis = var_3451_axis_0, split_sizes = var_3451_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_3451_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412388992)))];
+            tensor<fp16, [1, 1, 2560]> h_49_cast_fp16 = mul(x = var_3451_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_3457 = const()[name = string("op_3457"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3460_axes_0 = const()[name = string("op_3460_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3458_cast_fp16 = transpose(perm = var_3457, x = h_49_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3460_cast_fp16 = expand_dims(axes = var_3460_axes_0, x = var_3458_cast_fp16)[name = string("op_3460_cast_fp16")];
+            string var_3476_pad_type_0 = const()[name = string("op_3476_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3476_strides_0 = const()[name = string("op_3476_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3476_pad_0 = const()[name = string("op_3476_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3476_dilations_0 = const()[name = string("op_3476_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3476_groups_0 = const()[name = string("op_3476_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_3476 = conv(dilations = var_3476_dilations_0, groups = var_3476_groups_0, pad = var_3476_pad_0, pad_type = var_3476_pad_type_0, strides = var_3476_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_3460_cast_fp16)[name = string("op_3476")];
+            tensor<int32, [4]> var_3481 = const()[name = string("op_3481"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_3482 = reshape(shape = var_3481, x = var_3476)[name = string("op_3482")];
+            tensor<int32, [4]> var_3487 = const()[name = string("op_3487"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3497 = const()[name = string("op_3497"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_3488 = transpose(perm = var_3487, x = var_3482)[name = string("transpose_10")];
+            tensor<fp16, [1, 8, 512]> x_129 = reshape(shape = var_3497, x = var_3488)[name = string("x_129")];
+            int32 var_3503 = const()[name = string("op_3503"), val = int32(-1)];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_3505 = mul(x = x_129, y = const_65_promoted)[name = string("op_3505")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_197 = concat(axis = var_3503, interleave = input_197_interleave_0, values = (x_129, var_3505))[name = string("input_197")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3500_to_fp16 = const()[name = string("op_3500_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_3500_to_fp16, x = input_197)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_3510_split_sizes_0 = const()[name = string("op_3510_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3510_axis_0 = const()[name = string("op_3510_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_3510_0, tensor<fp16, [1, 8, 512]> var_3510_1 = split(axis = var_3510_axis_0, split_sizes = var_3510_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_3510")];
+            tensor<fp16, [1, 8, 512]> var_3512 = mul(x = var_3510_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_3512")];
+            tensor<int32, [4]> var_3517 = const()[name = string("op_3517"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_51 = reshape(shape = var_3517, x = var_3512)[name = string("q_51")];
+            tensor<fp16, [1, 8, 1, 512]> var_3519_cast_fp16 = mul(x = q_51, y = cos_f)[name = string("op_3519_cast_fp16")];
+            tensor<int32, [2]> var_3520_split_sizes_0 = const()[name = string("op_3520_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3520_axis_0 = const()[name = string("op_3520_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_3520_0, tensor<fp16, [1, 8, 1, 256]> var_3520_1 = split(axis = var_3520_axis_0, split_sizes = var_3520_split_sizes_0, x = q_51)[name = string("op_3520")];
+            fp16 const_66_promoted = const()[name = string("const_66_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_3522 = mul(x = var_3520_1, y = const_66_promoted)[name = string("op_3522")];
+            int32 var_3524 = const()[name = string("op_3524"), val = int32(-1)];
+            bool var_3525_interleave_0 = const()[name = string("op_3525_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_3525 = concat(axis = var_3524, interleave = var_3525_interleave_0, values = (var_3522, var_3520_0))[name = string("op_3525")];
+            tensor<fp16, [1, 8, 1, 512]> var_3526_cast_fp16 = mul(x = var_3525, y = sin_f)[name = string("op_3526_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_cast_fp16 = add(x = var_3519_cast_fp16, y = var_3526_cast_fp16)[name = string("q_cast_fp16")];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_cast_fp16, y = transpose_38_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_131_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_full)[name = string("x_131_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_131_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3558 = sub(x = x_131_cast_fp16, y = reduce_max_8)[name = string("op_3558")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3564 = exp(x = var_3558)[name = string("op_3564")];
+            tensor<int32, [1]> var_3574_axes_0 = const()[name = string("op_3574_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3574_keep_dims_0 = const()[name = string("op_3574_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3574 = reduce_sum(axes = var_3574_axes_0, keep_dims = var_3574_keep_dims_0, x = var_3564)[name = string("op_3574")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3580_cast_fp16 = real_div(x = var_3564, y = var_3574)[name = string("op_3580_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_3580_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_3591 = const()[name = string("op_3591"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3598 = const()[name = string("op_3598"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_3592_cast_fp16 = transpose(perm = var_3591, x = attn_output_49_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 1, 4096]> attn_output_51_cast_fp16 = reshape(shape = var_3598, x = var_3592_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_3603 = const()[name = string("op_3603"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3619_pad_type_0 = const()[name = string("op_3619_pad_type_0"), val = string("valid")];
+            int32 var_3619_groups_0 = const()[name = string("op_3619_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3619_strides_0 = const()[name = string("op_3619_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3619_pad_0 = const()[name = string("op_3619_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3619_dilations_0 = const()[name = string("op_3619_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412394176))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417637120))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_3604_cast_fp16 = transpose(perm = var_3603, x = attn_output_51_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 2560, 1]> var_3619_cast_fp16 = conv(dilations = var_3619_dilations_0, groups = var_3619_groups_0, pad = var_3619_pad_0, pad_type = var_3619_pad_type_0, strides = var_3619_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_3604_cast_fp16)[name = string("op_3619_cast_fp16")];
+            tensor<int32, [3]> var_3623 = const()[name = string("op_3623"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3629 = const()[name = string("op_3629"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_135_cast_fp16 = transpose(perm = var_3623, x = var_3619_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1, 2560]> var_3631_cast_fp16 = mul(x = x_135_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_3631_cast_fp16")];
+            bool input_201_interleave_0 = const()[name = string("input_201_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_201_cast_fp16 = concat(axis = var_3629, interleave = input_201_interleave_0, values = (x_135_cast_fp16, var_3631_cast_fp16))[name = string("input_201_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3626_to_fp16 = const()[name = string("op_3626_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_3626_to_fp16, x = input_201_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_3636_split_sizes_0 = const()[name = string("op_3636_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3636_axis_0 = const()[name = string("op_3636_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3636_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3636_cast_fp16_1 = split(axis = var_3636_axis_0, split_sizes = var_3636_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_3636_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417639744)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_cast_fp16 = mul(x = var_3636_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_137_cast_fp16 = add(x = x_127_cast_fp16, y = attn_output_cast_fp16)[name = string("x_137_cast_fp16")];
+            int32 var_3645 = const()[name = string("op_3645"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3647_cast_fp16 = mul(x = x_137_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_3647_cast_fp16")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_203_cast_fp16 = concat(axis = var_3645, interleave = input_203_interleave_0, values = (x_137_cast_fp16, var_3647_cast_fp16))[name = string("input_203_cast_fp16")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3642_to_fp16 = const()[name = string("op_3642_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_3642_to_fp16, x = input_203_cast_fp16)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_3652_split_sizes_0 = const()[name = string("op_3652_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3652_axis_0 = const()[name = string("op_3652_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3652_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3652_cast_fp16_1 = split(axis = var_3652_axis_0, split_sizes = var_3652_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_3652_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417644928)))];
+            tensor<fp16, [1, 1, 2560]> h_51_cast_fp16 = mul(x = var_3652_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_3663 = const()[name = string("op_3663"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3664 = transpose(perm = var_3663, x = h_51_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1, 1]> input_205 = expand_dims(axes = input_205_axes_0, x = var_3664)[name = string("input_205")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_205)[name = string("gate_33")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_205)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate = gelu(mode = gate_mode_0, x = gate_33)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 1]> input_207 = mul(x = gate, y = up)[name = string("input_207")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_207)[name = string("mlp_out")];
+            tensor<int32, [1]> var_3704_axes_0 = const()[name = string("op_3704_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3704 = squeeze(axes = var_3704_axes_0, x = mlp_out)[name = string("op_3704")];
+            tensor<int32, [3]> var_3708 = const()[name = string("op_3708"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3714 = const()[name = string("op_3714"), val = int32(-1)];
+            fp16 const_69_promoted = const()[name = string("const_69_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_139 = transpose(perm = var_3708, x = var_3704)[name = string("transpose_5")];
+            tensor<fp16, [1, 1, 2560]> var_3716 = mul(x = x_139, y = const_69_promoted)[name = string("op_3716")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_209 = concat(axis = var_3714, interleave = input_209_interleave_0, values = (x_139, var_3716))[name = string("input_209")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3711_to_fp16 = const()[name = string("op_3711_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_3711_to_fp16, x = input_209)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_3721_split_sizes_0 = const()[name = string("op_3721_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3721_axis_0 = const()[name = string("op_3721_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3721_0, tensor<fp16, [1, 1, 2560]> var_3721_1 = split(axis = var_3721_axis_0, split_sizes = var_3721_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_3721")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_83 = mul(x = var_3721_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_85_cast_fp16 = add(x = x_137_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 10496])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 1, 1])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_3749 = const()[name = string("op_3749"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_211_axes_0 = const()[name = string("input_211_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3750 = transpose(perm = var_3749, x = hidden_states_85_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 1]> input_211 = expand_dims(axes = input_211_axes_0, x = var_3750)[name = string("input_211")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_211)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_3769 = const()[name = string("op_3769"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3770_cast_fp16 = transpose(perm = var_3769, x = per_layer_slice_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_3770_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_213_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_cast_fp16)[name = string("input_213_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417650112))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417977856))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_213_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_3786_axes_0 = const()[name = string("op_3786_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3786_cast_fp16 = squeeze(axes = var_3786_axes_0, x = gated_cast_fp16)[name = string("op_3786_cast_fp16")];
+            tensor<int32, [3]> var_3790 = const()[name = string("op_3790"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3796 = const()[name = string("op_3796"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_141_cast_fp16 = transpose(perm = var_3790, x = var_3786_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 1, 2560]> var_3798_cast_fp16 = mul(x = x_141_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_3798_cast_fp16")];
+            bool input_215_interleave_0 = const()[name = string("input_215_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_215_cast_fp16 = concat(axis = var_3796, interleave = input_215_interleave_0, values = (x_141_cast_fp16, var_3798_cast_fp16))[name = string("input_215_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3793_to_fp16 = const()[name = string("op_3793_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_3793_to_fp16, x = input_215_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_3803_split_sizes_0 = const()[name = string("op_3803_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3803_axis_0 = const()[name = string("op_3803_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3803_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3803_cast_fp16_1 = split(axis = var_3803_axis_0, split_sizes = var_3803_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_3803_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417980480)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_89_cast_fp16 = mul(x = var_3803_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c8p-2])];
+            tensor<fp16, [1, 1, 2560]> x_cast_fp16 = mul(x = hidden_states_cast_fp16, y = const_71_promoted_to_fp16)[name = string("x_cast_fp16")];
+            int32 var_3818 = const()[name = string("op_3818"), val = int32(-1)];
+            fp16 const_72_promoted_to_fp16 = const()[name = string("const_72_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3820_cast_fp16 = mul(x = x_cast_fp16, y = const_72_promoted_to_fp16)[name = string("op_3820_cast_fp16")];
+            bool input_217_interleave_0 = const()[name = string("input_217_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_217_cast_fp16 = concat(axis = var_3818, interleave = input_217_interleave_0, values = (x_cast_fp16, var_3820_cast_fp16))[name = string("input_217_cast_fp16")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3815_to_fp16 = const()[name = string("op_3815_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_3815_to_fp16, x = input_217_cast_fp16)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_3825_split_sizes_0 = const()[name = string("op_3825_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3825_axis_0 = const()[name = string("op_3825_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3825_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3825_cast_fp16_1 = split(axis = var_3825_axis_0, split_sizes = var_3825_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_3825_cast_fp16")];
+            tensor<fp16, [2560]> norm_weight_promoted_to_fp16 = const()[name = string("norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417985664)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_out = mul(x = var_3825_cast_fp16_0, y = norm_weight_promoted_to_fp16)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [3]> var_3836 = const()[name = string("op_3836"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [262144, 2560, 1]> squeeze_9_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [262144, 2560, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417990848))), lut = tensor<fp16, [8192, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(753535232))))[name = string("squeeze_9_palettized")];
+            string var_3852_pad_type_0 = const()[name = string("op_3852_pad_type_0"), val = string("valid")];
+            int32 var_3852_groups_0 = const()[name = string("op_3852_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3852_strides_0 = const()[name = string("op_3852_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3852_pad_0 = const()[name = string("op_3852_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3852_dilations_0 = const()[name = string("op_3852_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 2560, 1]> var_3837 = transpose(perm = var_3836, x = hidden_states_out)[name = string("transpose_1")];
+            tensor<fp16, [1, 262144, 1]> var_3852 = conv(dilations = var_3852_dilations_0, groups = var_3852_groups_0, pad = var_3852_pad_0, pad_type = var_3852_pad_type_0, strides = var_3852_strides_0, weight = squeeze_9_palettized, x = var_3837)[name = string("op_3852")];
+            tensor<int32, [3]> var_3856 = const()[name = string("op_3856"), val = tensor<int32, [3]>([0, 2, 1])];
+            fp16 _inversed_3859_y_0_to_fp16 = const()[name = string("_inversed_3859_y_0_to_fp16"), val = fp16(0x1.11p-5)];
+            tensor<fp16, [1, 1, 262144]> logits_1 = transpose(perm = var_3856, x = var_3852)[name = string("transpose_0")];
+            tensor<fp16, [1, 1, 262144]> _inversed_3859_cast_fp16 = mul(x = logits_1, y = _inversed_3859_y_0_to_fp16)[name = string("_inversed_3859_cast_fp16")];
+            tensor<fp16, [1, 1, 262144]> var_3860_cast_fp16 = tanh(x = _inversed_3859_cast_fp16)[name = string("op_3860_cast_fp16")];
+            fp16 var_3861_to_fp16 = const()[name = string("op_3861_to_fp16"), val = fp16(0x1.ep+4)];
+            tensor<fp16, [1, 1, 262144]> logits_3_cast_fp16 = mul(x = var_3860_cast_fp16, y = var_3861_to_fp16)[name = string("logits_3_cast_fp16")];
+            tensor<int32, [1]> logits_axes_0 = const()[name = string("logits_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 262144]> logits_cast_fp16 = squeeze(axes = logits_axes_0, x = logits_3_cast_fp16)[name = string("logits_cast_fp16")];
+            int32 var_3866 = const()[name = string("op_3866"), val = int32(-1)];
+            int32 token_id_axis_0 = const()[name = string("token_id_axis_0"), val = int32(-1)];
+            bool token_id_keep_dims_0 = const()[name = string("token_id_keep_dims_0"), val = bool(false)];
+            string token_id_output_dtype_0 = const()[name = string("token_id_output_dtype_0"), val = string("int32")];
+            tensor<int32, [1]> token_id = reduce_argmax(axis = token_id_axis_0, keep_dims = token_id_keep_dims_0, output_dtype = token_id_output_dtype_0, x = logits_cast_fp16)[name = string("token_id_cast_fp16")];
+            tensor<int32, [1]> var_3868_axes_0 = const()[name = string("op_3868_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<int32, [1, 1]> var_3868 = expand_dims(axes = var_3868_axes_0, x = token_id)[name = string("op_3868")];
+            bool var_3869_validate_indices_0 = const()[name = string("op_3869_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [1, 1]> var_3869_cast_fp16 = gather_along_axis(axis = var_3866, indices = var_3868, validate_indices = var_3869_validate_indices_0, x = logits_cast_fp16)[name = string("op_3869_cast_fp16")];
+            tensor<int32, [1]> var_3870_axes_0 = const()[name = string("op_3870_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [1]> token_logit = squeeze(axes = var_3870_axes_0, x = var_3869_cast_fp16)[name = string("op_3870_cast_fp16")];
+            tensor<fp16, [1, 1, 2048, 1]> update_mask_tmp = identity(x = update_mask)[name = string("update_mask_tmp")];
+        } -> (token_id, token_logit, hidden_states_out);
+}
\ No newline at end of file
diff --git a/chunk3_3way.mlmodelc/weights/weight.bin b/chunk3_3way.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..15bc928e2d7a003f03c909e6e1e2c768f065d6d5
--- /dev/null
+++ b/chunk3_3way.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:155bcb0a818cb9f95184346c2cc319980d33f6acf5ec4b14fec14abc61888cd9
+size 753797440
diff --git a/chunk4.mlmodelc/analytics/coremldata.bin b/chunk4.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..bc04edd91e2bc058db3b238437f022a6655eb9e1
--- /dev/null
+++ b/chunk4.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58c1fb89f6c05774b2ac875839fcc1e5c153cc195cfe223ad9ffb42d2d30ea48
+size 243
diff --git a/chunk4.mlmodelc/coremldata.bin b/chunk4.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..78e37ebded6a28bd7beceb068c14f45fcb21382d
--- /dev/null
+++ b/chunk4.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b78c0ef3198782c4e4060bdf45f682dcb566666e05fee15858a5c2467c05965b
+size 1014
diff --git a/chunk4.mlmodelc/model.mil b/chunk4.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..9a947bdc7fbff8084eaac9cb943c51f9c8c3b663
--- /dev/null
+++ b/chunk4.mlmodelc/model.mil
@@ -0,0 +1,3946 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}})]
+{
+    func decode_q1<ios18>(tensor<fp16, [1, 1, 1, 2048]> causal_mask_full, tensor<fp16, [1, 1, 1, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 1, 512]> cos_f, tensor<fp16, [1, 1, 1, 256]> cos_s, tensor<fp16, [1, 1, 2560]> hidden_states, tensor<fp16, [1, 2, 512, 256]> kv13_k, tensor<fp16, [1, 2, 512, 256]> kv13_v, tensor<fp16, [1, 2, 2048, 512]> kv14_k, tensor<fp16, [1, 2, 2048, 512]> kv14_v, tensor<fp16, [1, 1, 10752]> per_layer_combined, tensor<fp16, [1, 1, 1, 512]> sin_f, tensor<fp16, [1, 1, 1, 256]> sin_s, tensor<fp16, [1, 1, 2048, 1]> update_mask) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15731520))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15741824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28849088))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28859392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41966656))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41969280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41974464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302208))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44924032))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44926144))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58033408))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58043712))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71150976))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71161280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84268544))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84271168)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84276352))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604096))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604416))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89847360))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89851520)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89852608))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102959872))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102970176))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116077440))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116087744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129195008))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129197632)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129202816))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129530560))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129530880))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132152384))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132154496))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145261760))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145272064))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158379328))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158389632))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171496896))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171499520)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171504704))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171832448))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171832768))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174454272))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174456384))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187563648))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187573952))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200681216))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200691520))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213798784))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213801408)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213806592))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214134336))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214134656))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216756160))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216758272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229865536))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229875840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242983104))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242993408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256100672))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256103296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256108480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436224))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259058048))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259060160))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272167424))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272177728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285284992))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285295296))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298402560))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298405184)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298410368))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738112))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738432))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301359936))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301362048))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314469312))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314479616))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327586880))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327597184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340704448))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340707072)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340712256))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040000))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040320))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346283264))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346287424))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(359394688))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(359404992))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(372512256))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(372522560))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385629824))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385632448)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385637632))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385965376))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            int32 var_452 = const()[name = string("op_452"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_454_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_454_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_1_cast_fp16 = concat(axis = var_452, interleave = input_1_interleave_0, values = (hidden_states, var_454_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_449_to_fp16 = const()[name = string("op_449_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_449_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_459_split_sizes_0 = const()[name = string("op_459_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_459_axis_0 = const()[name = string("op_459_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_459_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_459_cast_fp16_1 = split(axis = var_459_axis_0, split_sizes = var_459_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_459_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385965696)))];
+            tensor<fp16, [1, 1, 2560]> h_1_cast_fp16 = mul(x = var_459_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_465 = const()[name = string("op_465"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_468_axes_0 = const()[name = string("op_468_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_466_cast_fp16 = transpose(perm = var_465, x = h_1_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [1, 2560, 1, 1]> var_468_cast_fp16 = expand_dims(axes = var_468_axes_0, x = var_466_cast_fp16)[name = string("op_468_cast_fp16")];
+            string var_484_pad_type_0 = const()[name = string("op_484_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_484_strides_0 = const()[name = string("op_484_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_484_pad_0 = const()[name = string("op_484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_484_dilations_0 = const()[name = string("op_484_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_484_groups_0 = const()[name = string("op_484_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_484 = conv(dilations = var_484_dilations_0, groups = var_484_groups_0, pad = var_484_pad_0, pad_type = var_484_pad_type_0, strides = var_484_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_468_cast_fp16)[name = string("op_484")];
+            tensor<int32, [4]> var_489 = const()[name = string("op_489"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_490 = reshape(shape = var_489, x = var_484)[name = string("op_490")];
+            tensor<int32, [4]> var_495 = const()[name = string("op_495"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_505 = const()[name = string("op_505"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_496 = transpose(perm = var_495, x = var_490)[name = string("transpose_102")];
+            tensor<fp16, [1, 8, 256]> x_1 = reshape(shape = var_505, x = var_496)[name = string("x_1")];
+            int32 var_511 = const()[name = string("op_511"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_513 = mul(x = x_1, y = const_1_promoted)[name = string("op_513")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_5 = concat(axis = var_511, interleave = input_5_interleave_0, values = (x_1, var_513))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_508_to_fp16 = const()[name = string("op_508_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_508_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_518_split_sizes_0 = const()[name = string("op_518_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_518_axis_0 = const()[name = string("op_518_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_518_0, tensor<fp16, [1, 8, 256]> var_518_1 = split(axis = var_518_axis_0, split_sizes = var_518_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_518")];
+            tensor<fp16, [1, 8, 256]> var_520 = mul(x = var_518_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_520")];
+            tensor<int32, [4]> var_525 = const()[name = string("op_525"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_3 = reshape(shape = var_525, x = var_520)[name = string("q_3")];
+            tensor<fp16, [1, 8, 1, 256]> var_527_cast_fp16 = mul(x = q_3, y = cos_s)[name = string("op_527_cast_fp16")];
+            tensor<int32, [2]> var_528_split_sizes_0 = const()[name = string("op_528_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_528_axis_0 = const()[name = string("op_528_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_528_0, tensor<fp16, [1, 8, 1, 128]> var_528_1 = split(axis = var_528_axis_0, split_sizes = var_528_split_sizes_0, x = q_3)[name = string("op_528")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_530 = mul(x = var_528_1, y = const_2_promoted)[name = string("op_530")];
+            int32 var_532 = const()[name = string("op_532"), val = int32(-1)];
+            bool var_533_interleave_0 = const()[name = string("op_533_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_533 = concat(axis = var_532, interleave = var_533_interleave_0, values = (var_530, var_528_0))[name = string("op_533")];
+            tensor<fp16, [1, 8, 1, 256]> var_534_cast_fp16 = mul(x = var_533, y = sin_s)[name = string("op_534_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_5_cast_fp16 = add(x = var_527_cast_fp16, y = var_534_cast_fp16)[name = string("q_5_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = kv13_k)[name = string("transpose_101")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = kv13_v)[name = string("transpose_99")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_36_cast_fp16 = transpose(perm = transpose_36_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_5_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_3_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_3_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 1, 512]> var_566 = sub(x = x_3_cast_fp16, y = reduce_max_0)[name = string("op_566")];
+            tensor<fp16, [1, 8, 1, 512]> var_572 = exp(x = var_566)[name = string("op_572")];
+            tensor<int32, [1]> var_582_axes_0 = const()[name = string("op_582_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_582_keep_dims_0 = const()[name = string("op_582_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_582 = reduce_sum(axes = var_582_axes_0, keep_dims = var_582_keep_dims_0, x = var_572)[name = string("op_582")];
+            tensor<fp16, [1, 8, 1, 512]> var_588_cast_fp16 = real_div(x = var_572, y = var_582)[name = string("op_588_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_588_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_599 = const()[name = string("op_599"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_606 = const()[name = string("op_606"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_600_cast_fp16 = transpose(perm = var_599, x = attn_output_1_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 1, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_606, x = var_600_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_611 = const()[name = string("op_611"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_627_pad_type_0 = const()[name = string("op_627_pad_type_0"), val = string("valid")];
+            int32 var_627_groups_0 = const()[name = string("op_627_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_627_strides_0 = const()[name = string("op_627_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_627_pad_0 = const()[name = string("op_627_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_627_dilations_0 = const()[name = string("op_627_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385970880))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388592384))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_612_cast_fp16 = transpose(perm = var_611, x = attn_output_3_cast_fp16)[name = string("transpose_94")];
+            tensor<fp16, [1, 2560, 1]> var_627_cast_fp16 = conv(dilations = var_627_dilations_0, groups = var_627_groups_0, pad = var_627_pad_0, pad_type = var_627_pad_type_0, strides = var_627_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_612_cast_fp16)[name = string("op_627_cast_fp16")];
+            tensor<int32, [3]> var_631 = const()[name = string("op_631"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_637 = const()[name = string("op_637"), val = int32(-1)];
+            fp16 const_3_promoted_to_fp16 = const()[name = string("const_3_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_7_cast_fp16 = transpose(perm = var_631, x = var_627_cast_fp16)[name = string("transpose_93")];
+            tensor<fp16, [1, 1, 2560]> var_639_cast_fp16 = mul(x = x_7_cast_fp16, y = const_3_promoted_to_fp16)[name = string("op_639_cast_fp16")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_9_cast_fp16 = concat(axis = var_637, interleave = input_9_interleave_0, values = (x_7_cast_fp16, var_639_cast_fp16))[name = string("input_9_cast_fp16")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_634_to_fp16 = const()[name = string("op_634_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_634_to_fp16, x = input_9_cast_fp16)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_644_split_sizes_0 = const()[name = string("op_644_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_644_axis_0 = const()[name = string("op_644_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_644_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_644_cast_fp16_1 = split(axis = var_644_axis_0, split_sizes = var_644_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_644_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388595008)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_5_cast_fp16 = mul(x = var_644_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_9_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            int32 var_653 = const()[name = string("op_653"), val = int32(-1)];
+            fp16 const_4_promoted_to_fp16 = const()[name = string("const_4_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_655_cast_fp16 = mul(x = x_9_cast_fp16, y = const_4_promoted_to_fp16)[name = string("op_655_cast_fp16")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_11_cast_fp16 = concat(axis = var_653, interleave = input_11_interleave_0, values = (x_9_cast_fp16, var_655_cast_fp16))[name = string("input_11_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_650_to_fp16 = const()[name = string("op_650_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_650_to_fp16, x = input_11_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_660_split_sizes_0 = const()[name = string("op_660_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_660_axis_0 = const()[name = string("op_660_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_660_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_660_cast_fp16_1 = split(axis = var_660_axis_0, split_sizes = var_660_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_660_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388600192)))];
+            tensor<fp16, [1, 1, 2560]> h_3_cast_fp16 = mul(x = var_660_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_671 = const()[name = string("op_671"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_13_axes_0 = const()[name = string("input_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_672 = transpose(perm = var_671, x = h_3_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 2560, 1, 1]> input_13 = expand_dims(axes = input_13_axes_0, x = var_672)[name = string("input_13")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_13)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_13)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 1]> input_15 = mul(x = gate_3, y = up_1)[name = string("input_15")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_15)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_712_axes_0 = const()[name = string("op_712_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_712 = squeeze(axes = var_712_axes_0, x = mlp_out_1)[name = string("op_712")];
+            tensor<int32, [3]> var_716 = const()[name = string("op_716"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_722 = const()[name = string("op_722"), val = int32(-1)];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_11 = transpose(perm = var_716, x = var_712)[name = string("transpose_91")];
+            tensor<fp16, [1, 1, 2560]> var_724 = mul(x = x_11, y = const_5_promoted)[name = string("op_724")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_17 = concat(axis = var_722, interleave = input_17_interleave_0, values = (x_11, var_724))[name = string("input_17")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_719_to_fp16 = const()[name = string("op_719_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_719_to_fp16, x = input_17)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_729_split_sizes_0 = const()[name = string("op_729_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_729_axis_0 = const()[name = string("op_729_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_729_0, tensor<fp16, [1, 1, 2560]> var_729_1 = split(axis = var_729_axis_0, split_sizes = var_729_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_729")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_3 = mul(x = var_729_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_5_cast_fp16 = add(x = x_9_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 8448])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 1, 8704])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_757 = const()[name = string("op_757"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_758 = transpose(perm = var_757, x = hidden_states_5_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 2560, 1, 1]> input_19 = expand_dims(axes = input_19_axes_0, x = var_758)[name = string("input_19")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_19)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_777 = const()[name = string("op_777"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_778_cast_fp16 = transpose(perm = var_777, x = per_layer_slice_1_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_778_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_21_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_21_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388605376))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388933120))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_21_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_794_axes_0 = const()[name = string("op_794_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_794_cast_fp16 = squeeze(axes = var_794_axes_0, x = gated_5_cast_fp16)[name = string("op_794_cast_fp16")];
+            tensor<int32, [3]> var_798 = const()[name = string("op_798"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_804 = const()[name = string("op_804"), val = int32(-1)];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_13_cast_fp16 = transpose(perm = var_798, x = var_794_cast_fp16)[name = string("transpose_88")];
+            tensor<fp16, [1, 1, 2560]> var_806_cast_fp16 = mul(x = x_13_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_806_cast_fp16")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_23_cast_fp16 = concat(axis = var_804, interleave = input_23_interleave_0, values = (x_13_cast_fp16, var_806_cast_fp16))[name = string("input_23_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_801_to_fp16 = const()[name = string("op_801_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_801_to_fp16, x = input_23_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_811_split_sizes_0 = const()[name = string("op_811_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_811_axis_0 = const()[name = string("op_811_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_811_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_811_cast_fp16_1 = split(axis = var_811_axis_0, split_sizes = var_811_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_811_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388935744)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_9_cast_fp16 = mul(x = var_811_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_15_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_826 = const()[name = string("op_826"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_828_cast_fp16 = mul(x = x_15_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_828_cast_fp16")];
+            bool input_25_interleave_0 = const()[name = string("input_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_25_cast_fp16 = concat(axis = var_826, interleave = input_25_interleave_0, values = (x_15_cast_fp16, var_828_cast_fp16))[name = string("input_25_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_823_to_fp16 = const()[name = string("op_823_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_823_to_fp16, x = input_25_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_833_split_sizes_0 = const()[name = string("op_833_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_833_axis_0 = const()[name = string("op_833_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_833_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_833_cast_fp16_1 = split(axis = var_833_axis_0, split_sizes = var_833_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_833_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388940928)))];
+            tensor<fp16, [1, 1, 2560]> h_7_cast_fp16 = mul(x = var_833_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_839 = const()[name = string("op_839"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_842_axes_0 = const()[name = string("op_842_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_840_cast_fp16 = transpose(perm = var_839, x = h_7_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 2560, 1, 1]> var_842_cast_fp16 = expand_dims(axes = var_842_axes_0, x = var_840_cast_fp16)[name = string("op_842_cast_fp16")];
+            string var_858_pad_type_0 = const()[name = string("op_858_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_858_strides_0 = const()[name = string("op_858_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_858_pad_0 = const()[name = string("op_858_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_858_dilations_0 = const()[name = string("op_858_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_858_groups_0 = const()[name = string("op_858_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_858 = conv(dilations = var_858_dilations_0, groups = var_858_groups_0, pad = var_858_pad_0, pad_type = var_858_pad_type_0, strides = var_858_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_842_cast_fp16)[name = string("op_858")];
+            tensor<int32, [4]> var_863 = const()[name = string("op_863"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_864 = reshape(shape = var_863, x = var_858)[name = string("op_864")];
+            tensor<int32, [4]> var_869 = const()[name = string("op_869"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_879 = const()[name = string("op_879"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_870 = transpose(perm = var_869, x = var_864)[name = string("transpose_86")];
+            tensor<fp16, [1, 8, 256]> x_17 = reshape(shape = var_879, x = var_870)[name = string("x_17")];
+            int32 var_885 = const()[name = string("op_885"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_887 = mul(x = x_17, y = const_9_promoted)[name = string("op_887")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_29 = concat(axis = var_885, interleave = input_29_interleave_0, values = (x_17, var_887))[name = string("input_29")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_882_to_fp16 = const()[name = string("op_882_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_882_to_fp16, x = input_29)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_892_split_sizes_0 = const()[name = string("op_892_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_892_axis_0 = const()[name = string("op_892_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_892_0, tensor<fp16, [1, 8, 256]> var_892_1 = split(axis = var_892_axis_0, split_sizes = var_892_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_892")];
+            tensor<fp16, [1, 8, 256]> var_894 = mul(x = var_892_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_894")];
+            tensor<int32, [4]> var_899 = const()[name = string("op_899"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_9 = reshape(shape = var_899, x = var_894)[name = string("q_9")];
+            tensor<fp16, [1, 8, 1, 256]> var_901_cast_fp16 = mul(x = q_9, y = cos_s)[name = string("op_901_cast_fp16")];
+            tensor<int32, [2]> var_902_split_sizes_0 = const()[name = string("op_902_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_902_axis_0 = const()[name = string("op_902_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_902_0, tensor<fp16, [1, 8, 1, 128]> var_902_1 = split(axis = var_902_axis_0, split_sizes = var_902_split_sizes_0, x = q_9)[name = string("op_902")];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_904 = mul(x = var_902_1, y = const_10_promoted)[name = string("op_904")];
+            int32 var_906 = const()[name = string("op_906"), val = int32(-1)];
+            bool var_907_interleave_0 = const()[name = string("op_907_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_907 = concat(axis = var_906, interleave = var_907_interleave_0, values = (var_904, var_902_0))[name = string("op_907")];
+            tensor<fp16, [1, 8, 1, 256]> var_908_cast_fp16 = mul(x = var_907, y = sin_s)[name = string("op_908_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_11_cast_fp16 = add(x = var_901_cast_fp16, y = var_908_cast_fp16)[name = string("q_11_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_11_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_19_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_19_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 1, 512]> var_940 = sub(x = x_19_cast_fp16, y = reduce_max_1)[name = string("op_940")];
+            tensor<fp16, [1, 8, 1, 512]> var_946 = exp(x = var_940)[name = string("op_946")];
+            tensor<int32, [1]> var_956_axes_0 = const()[name = string("op_956_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_956_keep_dims_0 = const()[name = string("op_956_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_956 = reduce_sum(axes = var_956_axes_0, keep_dims = var_956_keep_dims_0, x = var_946)[name = string("op_956")];
+            tensor<fp16, [1, 8, 1, 512]> var_962_cast_fp16 = real_div(x = var_946, y = var_956)[name = string("op_962_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_962_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_973 = const()[name = string("op_973"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_980 = const()[name = string("op_980"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_974_cast_fp16 = transpose(perm = var_973, x = attn_output_7_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 1, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_980, x = var_974_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_985 = const()[name = string("op_985"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1001_pad_type_0 = const()[name = string("op_1001_pad_type_0"), val = string("valid")];
+            int32 var_1001_groups_0 = const()[name = string("op_1001_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1001_strides_0 = const()[name = string("op_1001_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1001_pad_0 = const()[name = string("op_1001_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1001_dilations_0 = const()[name = string("op_1001_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388946112))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391567616))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_986_cast_fp16 = transpose(perm = var_985, x = attn_output_9_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 2560, 1]> var_1001_cast_fp16 = conv(dilations = var_1001_dilations_0, groups = var_1001_groups_0, pad = var_1001_pad_0, pad_type = var_1001_pad_type_0, strides = var_1001_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_986_cast_fp16)[name = string("op_1001_cast_fp16")];
+            tensor<int32, [3]> var_1005 = const()[name = string("op_1005"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1011 = const()[name = string("op_1011"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_23_cast_fp16 = transpose(perm = var_1005, x = var_1001_cast_fp16)[name = string("transpose_83")];
+            tensor<fp16, [1, 1, 2560]> var_1013_cast_fp16 = mul(x = x_23_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1013_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_33_cast_fp16 = concat(axis = var_1011, interleave = input_33_interleave_0, values = (x_23_cast_fp16, var_1013_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1008_to_fp16 = const()[name = string("op_1008_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1008_to_fp16, x = input_33_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1018_split_sizes_0 = const()[name = string("op_1018_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1018_axis_0 = const()[name = string("op_1018_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1018_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1018_cast_fp16_1 = split(axis = var_1018_axis_0, split_sizes = var_1018_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1018_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391570240)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_11_cast_fp16 = mul(x = var_1018_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_25_cast_fp16 = add(x = x_15_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_25_cast_fp16")];
+            int32 var_1027 = const()[name = string("op_1027"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1029_cast_fp16 = mul(x = x_25_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1029_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_35_cast_fp16 = concat(axis = var_1027, interleave = input_35_interleave_0, values = (x_25_cast_fp16, var_1029_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1024_to_fp16 = const()[name = string("op_1024_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1024_to_fp16, x = input_35_cast_fp16)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1034_split_sizes_0 = const()[name = string("op_1034_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1034_axis_0 = const()[name = string("op_1034_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1034_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1034_cast_fp16_1 = split(axis = var_1034_axis_0, split_sizes = var_1034_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1034_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391575424)))];
+            tensor<fp16, [1, 1, 2560]> h_9_cast_fp16 = mul(x = var_1034_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1045 = const()[name = string("op_1045"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_37_axes_0 = const()[name = string("input_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1046 = transpose(perm = var_1045, x = h_9_cast_fp16)[name = string("transpose_82")];
+            tensor<fp16, [1, 2560, 1, 1]> input_37 = expand_dims(axes = input_37_axes_0, x = var_1046)[name = string("input_37")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_37)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_37)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 1]> input_39 = mul(x = gate_7, y = up_3)[name = string("input_39")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_39)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1086_axes_0 = const()[name = string("op_1086_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1086 = squeeze(axes = var_1086_axes_0, x = mlp_out_3)[name = string("op_1086")];
+            tensor<int32, [3]> var_1090 = const()[name = string("op_1090"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1096 = const()[name = string("op_1096"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_27 = transpose(perm = var_1090, x = var_1086)[name = string("transpose_81")];
+            tensor<fp16, [1, 1, 2560]> var_1098 = mul(x = x_27, y = const_13_promoted)[name = string("op_1098")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_41 = concat(axis = var_1096, interleave = input_41_interleave_0, values = (x_27, var_1098))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1093_to_fp16 = const()[name = string("op_1093_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1093_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1103_split_sizes_0 = const()[name = string("op_1103_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1103_axis_0 = const()[name = string("op_1103_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1103_0, tensor<fp16, [1, 1, 2560]> var_1103_1 = split(axis = var_1103_axis_0, split_sizes = var_1103_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1103")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_13 = mul(x = var_1103_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_15_cast_fp16 = add(x = x_25_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 8704])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 1, 8960])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1131 = const()[name = string("op_1131"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_43_axes_0 = const()[name = string("input_43_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1132 = transpose(perm = var_1131, x = hidden_states_15_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [1, 2560, 1, 1]> input_43 = expand_dims(axes = input_43_axes_0, x = var_1132)[name = string("input_43")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_43)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1151 = const()[name = string("op_1151"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1152_cast_fp16 = transpose(perm = var_1151, x = per_layer_slice_3_cast_fp16)[name = string("transpose_79")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1152_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_45_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391580608))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391908352))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_45_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1168_axes_0 = const()[name = string("op_1168_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1168_cast_fp16 = squeeze(axes = var_1168_axes_0, x = gated_11_cast_fp16)[name = string("op_1168_cast_fp16")];
+            tensor<int32, [3]> var_1172 = const()[name = string("op_1172"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1178 = const()[name = string("op_1178"), val = int32(-1)];
+            fp16 const_14_promoted_to_fp16 = const()[name = string("const_14_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_29_cast_fp16 = transpose(perm = var_1172, x = var_1168_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [1, 1, 2560]> var_1180_cast_fp16 = mul(x = x_29_cast_fp16, y = const_14_promoted_to_fp16)[name = string("op_1180_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_47_cast_fp16 = concat(axis = var_1178, interleave = input_47_interleave_0, values = (x_29_cast_fp16, var_1180_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1175_to_fp16 = const()[name = string("op_1175_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1175_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1185_split_sizes_0 = const()[name = string("op_1185_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1185_axis_0 = const()[name = string("op_1185_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1185_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1185_cast_fp16_1 = split(axis = var_1185_axis_0, split_sizes = var_1185_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1185_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391910976)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1185_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_15_promoted_to_fp16 = const()[name = string("const_15_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.acp-1])];
+            tensor<fp16, [1, 1, 2560]> x_31_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_15_promoted_to_fp16)[name = string("x_31_cast_fp16")];
+            int32 var_1200 = const()[name = string("op_1200"), val = int32(-1)];
+            fp16 const_16_promoted_to_fp16 = const()[name = string("const_16_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1202_cast_fp16 = mul(x = x_31_cast_fp16, y = const_16_promoted_to_fp16)[name = string("op_1202_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_49_cast_fp16 = concat(axis = var_1200, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1202_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1197_to_fp16 = const()[name = string("op_1197_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1197_to_fp16, x = input_49_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1207_split_sizes_0 = const()[name = string("op_1207_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1207_axis_0 = const()[name = string("op_1207_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1207_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1207_cast_fp16_1 = split(axis = var_1207_axis_0, split_sizes = var_1207_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1207_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391916160)))];
+            tensor<fp16, [1, 1, 2560]> h_13_cast_fp16 = mul(x = var_1207_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1213 = const()[name = string("op_1213"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1216_axes_0 = const()[name = string("op_1216_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1214_cast_fp16 = transpose(perm = var_1213, x = h_13_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1216_cast_fp16 = expand_dims(axes = var_1216_axes_0, x = var_1214_cast_fp16)[name = string("op_1216_cast_fp16")];
+            string var_1232_pad_type_0 = const()[name = string("op_1232_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1232_strides_0 = const()[name = string("op_1232_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1232_pad_0 = const()[name = string("op_1232_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1232_dilations_0 = const()[name = string("op_1232_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1232_groups_0 = const()[name = string("op_1232_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_1232 = conv(dilations = var_1232_dilations_0, groups = var_1232_groups_0, pad = var_1232_pad_0, pad_type = var_1232_pad_type_0, strides = var_1232_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1216_cast_fp16)[name = string("op_1232")];
+            tensor<int32, [4]> var_1237 = const()[name = string("op_1237"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_1238 = reshape(shape = var_1237, x = var_1232)[name = string("op_1238")];
+            tensor<int32, [4]> var_1243 = const()[name = string("op_1243"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1253 = const()[name = string("op_1253"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_1244 = transpose(perm = var_1243, x = var_1238)[name = string("transpose_76")];
+            tensor<fp16, [1, 8, 512]> x_33 = reshape(shape = var_1253, x = var_1244)[name = string("x_33")];
+            int32 var_1259 = const()[name = string("op_1259"), val = int32(-1)];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_1261 = mul(x = x_33, y = const_17_promoted)[name = string("op_1261")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_53 = concat(axis = var_1259, interleave = input_53_interleave_0, values = (x_33, var_1261))[name = string("input_53")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1256_to_fp16 = const()[name = string("op_1256_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1256_to_fp16, x = input_53)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1266_split_sizes_0 = const()[name = string("op_1266_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_1266_axis_0 = const()[name = string("op_1266_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_1266_0, tensor<fp16, [1, 8, 512]> var_1266_1 = split(axis = var_1266_axis_0, split_sizes = var_1266_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1266")];
+            tensor<fp16, [1, 8, 512]> var_1268 = mul(x = var_1266_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_1268")];
+            tensor<int32, [4]> var_1273 = const()[name = string("op_1273"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_15 = reshape(shape = var_1273, x = var_1268)[name = string("q_15")];
+            tensor<fp16, [1, 8, 1, 512]> var_1275_cast_fp16 = mul(x = q_15, y = cos_f)[name = string("op_1275_cast_fp16")];
+            tensor<int32, [2]> var_1276_split_sizes_0 = const()[name = string("op_1276_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1276_axis_0 = const()[name = string("op_1276_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_1276_0, tensor<fp16, [1, 8, 1, 256]> var_1276_1 = split(axis = var_1276_axis_0, split_sizes = var_1276_split_sizes_0, x = q_15)[name = string("op_1276")];
+            fp16 const_18_promoted = const()[name = string("const_18_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_1278 = mul(x = var_1276_1, y = const_18_promoted)[name = string("op_1278")];
+            int32 var_1280 = const()[name = string("op_1280"), val = int32(-1)];
+            bool var_1281_interleave_0 = const()[name = string("op_1281_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_1281 = concat(axis = var_1280, interleave = var_1281_interleave_0, values = (var_1278, var_1276_0))[name = string("op_1281")];
+            tensor<fp16, [1, 8, 1, 512]> var_1282_cast_fp16 = mul(x = var_1281, y = sin_f)[name = string("op_1282_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_17_cast_fp16 = add(x = var_1275_cast_fp16, y = var_1282_cast_fp16)[name = string("q_17_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = kv14_k)[name = string("transpose_75")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = kv14_v)[name = string("transpose_73")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_38_cast_fp16 = transpose(perm = transpose_38_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_71")];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_17_cast_fp16, y = transpose_38_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_35_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_full)[name = string("x_35_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_35_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 1, 2048]> var_1314 = sub(x = x_35_cast_fp16, y = reduce_max_2)[name = string("op_1314")];
+            tensor<fp16, [1, 8, 1, 2048]> var_1320 = exp(x = var_1314)[name = string("op_1320")];
+            tensor<int32, [1]> var_1330_axes_0 = const()[name = string("op_1330_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1330_keep_dims_0 = const()[name = string("op_1330_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1330 = reduce_sum(axes = var_1330_axes_0, keep_dims = var_1330_keep_dims_0, x = var_1320)[name = string("op_1330")];
+            tensor<fp16, [1, 8, 1, 2048]> var_1336_cast_fp16 = real_div(x = var_1320, y = var_1330)[name = string("op_1336_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_1336_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1347 = const()[name = string("op_1347"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1354 = const()[name = string("op_1354"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_1348_cast_fp16 = transpose(perm = var_1347, x = attn_output_13_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 1, 4096]> attn_output_15_cast_fp16 = reshape(shape = var_1354, x = var_1348_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_1359 = const()[name = string("op_1359"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1375_pad_type_0 = const()[name = string("op_1375_pad_type_0"), val = string("valid")];
+            int32 var_1375_groups_0 = const()[name = string("op_1375_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1375_strides_0 = const()[name = string("op_1375_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1375_pad_0 = const()[name = string("op_1375_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1375_dilations_0 = const()[name = string("op_1375_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391921344))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397164288))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_1360_cast_fp16 = transpose(perm = var_1359, x = attn_output_15_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 2560, 1]> var_1375_cast_fp16 = conv(dilations = var_1375_dilations_0, groups = var_1375_groups_0, pad = var_1375_pad_0, pad_type = var_1375_pad_type_0, strides = var_1375_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_1360_cast_fp16)[name = string("op_1375_cast_fp16")];
+            tensor<int32, [3]> var_1379 = const()[name = string("op_1379"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1385 = const()[name = string("op_1385"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_39_cast_fp16 = transpose(perm = var_1379, x = var_1375_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 1, 2560]> var_1387_cast_fp16 = mul(x = x_39_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1387_cast_fp16")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_57_cast_fp16 = concat(axis = var_1385, interleave = input_57_interleave_0, values = (x_39_cast_fp16, var_1387_cast_fp16))[name = string("input_57_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1382_to_fp16 = const()[name = string("op_1382_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1382_to_fp16, x = input_57_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1392_split_sizes_0 = const()[name = string("op_1392_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1392_axis_0 = const()[name = string("op_1392_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1392_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1392_cast_fp16_1 = split(axis = var_1392_axis_0, split_sizes = var_1392_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1392_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397166912)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_17_cast_fp16 = mul(x = var_1392_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_41_cast_fp16 = add(x = x_31_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_41_cast_fp16")];
+            int32 var_1401 = const()[name = string("op_1401"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1403_cast_fp16 = mul(x = x_41_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1403_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_59_cast_fp16 = concat(axis = var_1401, interleave = input_59_interleave_0, values = (x_41_cast_fp16, var_1403_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1398_to_fp16 = const()[name = string("op_1398_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1398_to_fp16, x = input_59_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1408_split_sizes_0 = const()[name = string("op_1408_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1408_axis_0 = const()[name = string("op_1408_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1408_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1408_cast_fp16_1 = split(axis = var_1408_axis_0, split_sizes = var_1408_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1408_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397172096)))];
+            tensor<fp16, [1, 1, 2560]> h_15_cast_fp16 = mul(x = var_1408_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_1419 = const()[name = string("op_1419"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_61_axes_0 = const()[name = string("input_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1420 = transpose(perm = var_1419, x = h_15_cast_fp16)[name = string("transpose_66")];
+            tensor<fp16, [1, 2560, 1, 1]> input_61 = expand_dims(axes = input_61_axes_0, x = var_1420)[name = string("input_61")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_61)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_61)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 1]> input_63 = mul(x = gate_11, y = up_5)[name = string("input_63")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_63)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_1460_axes_0 = const()[name = string("op_1460_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1460 = squeeze(axes = var_1460_axes_0, x = mlp_out_5)[name = string("op_1460")];
+            tensor<int32, [3]> var_1464 = const()[name = string("op_1464"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1470 = const()[name = string("op_1470"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_43 = transpose(perm = var_1464, x = var_1460)[name = string("transpose_65")];
+            tensor<fp16, [1, 1, 2560]> var_1472 = mul(x = x_43, y = const_21_promoted)[name = string("op_1472")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_65 = concat(axis = var_1470, interleave = input_65_interleave_0, values = (x_43, var_1472))[name = string("input_65")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1467_to_fp16 = const()[name = string("op_1467_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_1467_to_fp16, x = input_65)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_1477_split_sizes_0 = const()[name = string("op_1477_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1477_axis_0 = const()[name = string("op_1477_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1477_0, tensor<fp16, [1, 1, 2560]> var_1477_1 = split(axis = var_1477_axis_0, split_sizes = var_1477_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_1477")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_23 = mul(x = var_1477_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_25_cast_fp16 = add(x = x_41_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 8960])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 1, 9216])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_1505 = const()[name = string("op_1505"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_67_axes_0 = const()[name = string("input_67_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1506 = transpose(perm = var_1505, x = hidden_states_25_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 2560, 1, 1]> input_67 = expand_dims(axes = input_67_axes_0, x = var_1506)[name = string("input_67")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_67)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_1525 = const()[name = string("op_1525"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1526_cast_fp16 = transpose(perm = var_1525, x = per_layer_slice_5_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_1526_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_69_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_69_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397177280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397505024))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_69_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_1542_axes_0 = const()[name = string("op_1542_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1542_cast_fp16 = squeeze(axes = var_1542_axes_0, x = gated_17_cast_fp16)[name = string("op_1542_cast_fp16")];
+            tensor<int32, [3]> var_1546 = const()[name = string("op_1546"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1552 = const()[name = string("op_1552"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_45_cast_fp16 = transpose(perm = var_1546, x = var_1542_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 1, 2560]> var_1554_cast_fp16 = mul(x = x_45_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1554_cast_fp16")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_71_cast_fp16 = concat(axis = var_1552, interleave = input_71_interleave_0, values = (x_45_cast_fp16, var_1554_cast_fp16))[name = string("input_71_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1549_to_fp16 = const()[name = string("op_1549_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_1549_to_fp16, x = input_71_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_1559_split_sizes_0 = const()[name = string("op_1559_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1559_axis_0 = const()[name = string("op_1559_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1559_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1559_cast_fp16_1 = split(axis = var_1559_axis_0, split_sizes = var_1559_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_1559_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397507648)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_29_cast_fp16 = mul(x = var_1559_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.acp-1])];
+            tensor<fp16, [1, 1, 2560]> x_47_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_47_cast_fp16")];
+            int32 var_1574 = const()[name = string("op_1574"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1576_cast_fp16 = mul(x = x_47_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1576_cast_fp16")];
+            bool input_73_interleave_0 = const()[name = string("input_73_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_73_cast_fp16 = concat(axis = var_1574, interleave = input_73_interleave_0, values = (x_47_cast_fp16, var_1576_cast_fp16))[name = string("input_73_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1571_to_fp16 = const()[name = string("op_1571_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_1571_to_fp16, x = input_73_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_1581_split_sizes_0 = const()[name = string("op_1581_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1581_axis_0 = const()[name = string("op_1581_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1581_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1581_cast_fp16_1 = split(axis = var_1581_axis_0, split_sizes = var_1581_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_1581_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397512832)))];
+            tensor<fp16, [1, 1, 2560]> h_19_cast_fp16 = mul(x = var_1581_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_1587 = const()[name = string("op_1587"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1590_axes_0 = const()[name = string("op_1590_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1588_cast_fp16 = transpose(perm = var_1587, x = h_19_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1590_cast_fp16 = expand_dims(axes = var_1590_axes_0, x = var_1588_cast_fp16)[name = string("op_1590_cast_fp16")];
+            string var_1606_pad_type_0 = const()[name = string("op_1606_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1606_strides_0 = const()[name = string("op_1606_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1606_pad_0 = const()[name = string("op_1606_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1606_dilations_0 = const()[name = string("op_1606_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1606_groups_0 = const()[name = string("op_1606_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1606 = conv(dilations = var_1606_dilations_0, groups = var_1606_groups_0, pad = var_1606_pad_0, pad_type = var_1606_pad_type_0, strides = var_1606_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_1590_cast_fp16)[name = string("op_1606")];
+            tensor<int32, [4]> var_1611 = const()[name = string("op_1611"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1612 = reshape(shape = var_1611, x = var_1606)[name = string("op_1612")];
+            tensor<int32, [4]> var_1617 = const()[name = string("op_1617"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_1627 = const()[name = string("op_1627"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1618 = transpose(perm = var_1617, x = var_1612)[name = string("transpose_60")];
+            tensor<fp16, [1, 8, 256]> x_49 = reshape(shape = var_1627, x = var_1618)[name = string("x_49")];
+            int32 var_1633 = const()[name = string("op_1633"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_1635 = mul(x = x_49, y = const_25_promoted)[name = string("op_1635")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_77 = concat(axis = var_1633, interleave = input_77_interleave_0, values = (x_49, var_1635))[name = string("input_77")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1630_to_fp16 = const()[name = string("op_1630_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_1630_to_fp16, x = input_77)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_1640_split_sizes_0 = const()[name = string("op_1640_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1640_axis_0 = const()[name = string("op_1640_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_1640_0, tensor<fp16, [1, 8, 256]> var_1640_1 = split(axis = var_1640_axis_0, split_sizes = var_1640_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_1640")];
+            tensor<fp16, [1, 8, 256]> var_1642 = mul(x = var_1640_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_1642")];
+            tensor<int32, [4]> var_1647 = const()[name = string("op_1647"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_21 = reshape(shape = var_1647, x = var_1642)[name = string("q_21")];
+            tensor<fp16, [1, 8, 1, 256]> var_1649_cast_fp16 = mul(x = q_21, y = cos_s)[name = string("op_1649_cast_fp16")];
+            tensor<int32, [2]> var_1650_split_sizes_0 = const()[name = string("op_1650_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1650_axis_0 = const()[name = string("op_1650_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_1650_0, tensor<fp16, [1, 8, 1, 128]> var_1650_1 = split(axis = var_1650_axis_0, split_sizes = var_1650_split_sizes_0, x = q_21)[name = string("op_1650")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_1652 = mul(x = var_1650_1, y = const_26_promoted)[name = string("op_1652")];
+            int32 var_1654 = const()[name = string("op_1654"), val = int32(-1)];
+            bool var_1655_interleave_0 = const()[name = string("op_1655_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_1655 = concat(axis = var_1654, interleave = var_1655_interleave_0, values = (var_1652, var_1650_0))[name = string("op_1655")];
+            tensor<fp16, [1, 8, 1, 256]> var_1656_cast_fp16 = mul(x = var_1655, y = sin_s)[name = string("op_1656_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_23_cast_fp16 = add(x = var_1649_cast_fp16, y = var_1656_cast_fp16)[name = string("q_23_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_23_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_51_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_51_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_51_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 1, 512]> var_1688 = sub(x = x_51_cast_fp16, y = reduce_max_3)[name = string("op_1688")];
+            tensor<fp16, [1, 8, 1, 512]> var_1694 = exp(x = var_1688)[name = string("op_1694")];
+            tensor<int32, [1]> var_1704_axes_0 = const()[name = string("op_1704_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1704_keep_dims_0 = const()[name = string("op_1704_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_1704 = reduce_sum(axes = var_1704_axes_0, keep_dims = var_1704_keep_dims_0, x = var_1694)[name = string("op_1704")];
+            tensor<fp16, [1, 8, 1, 512]> var_1710_cast_fp16 = real_div(x = var_1694, y = var_1704)[name = string("op_1710_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_1710_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_1721 = const()[name = string("op_1721"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1728 = const()[name = string("op_1728"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_1722_cast_fp16 = transpose(perm = var_1721, x = attn_output_19_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 1, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_1728, x = var_1722_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_1733 = const()[name = string("op_1733"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1749_pad_type_0 = const()[name = string("op_1749_pad_type_0"), val = string("valid")];
+            int32 var_1749_groups_0 = const()[name = string("op_1749_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1749_strides_0 = const()[name = string("op_1749_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1749_pad_0 = const()[name = string("op_1749_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1749_dilations_0 = const()[name = string("op_1749_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397518016))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400139520))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_1734_cast_fp16 = transpose(perm = var_1733, x = attn_output_21_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 2560, 1]> var_1749_cast_fp16 = conv(dilations = var_1749_dilations_0, groups = var_1749_groups_0, pad = var_1749_pad_0, pad_type = var_1749_pad_type_0, strides = var_1749_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_1734_cast_fp16)[name = string("op_1749_cast_fp16")];
+            tensor<int32, [3]> var_1753 = const()[name = string("op_1753"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1759 = const()[name = string("op_1759"), val = int32(-1)];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_55_cast_fp16 = transpose(perm = var_1753, x = var_1749_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 1, 2560]> var_1761_cast_fp16 = mul(x = x_55_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_1761_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_81_cast_fp16 = concat(axis = var_1759, interleave = input_81_interleave_0, values = (x_55_cast_fp16, var_1761_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1756_to_fp16 = const()[name = string("op_1756_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_1756_to_fp16, x = input_81_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_1766_split_sizes_0 = const()[name = string("op_1766_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1766_axis_0 = const()[name = string("op_1766_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1766_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1766_cast_fp16_1 = split(axis = var_1766_axis_0, split_sizes = var_1766_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_1766_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400142144)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_23_cast_fp16 = mul(x = var_1766_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_57_cast_fp16 = add(x = x_47_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_57_cast_fp16")];
+            int32 var_1775 = const()[name = string("op_1775"), val = int32(-1)];
+            fp16 const_28_promoted_to_fp16 = const()[name = string("const_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1777_cast_fp16 = mul(x = x_57_cast_fp16, y = const_28_promoted_to_fp16)[name = string("op_1777_cast_fp16")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_83_cast_fp16 = concat(axis = var_1775, interleave = input_83_interleave_0, values = (x_57_cast_fp16, var_1777_cast_fp16))[name = string("input_83_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1772_to_fp16 = const()[name = string("op_1772_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_1772_to_fp16, x = input_83_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_1782_split_sizes_0 = const()[name = string("op_1782_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1782_axis_0 = const()[name = string("op_1782_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1782_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1782_cast_fp16_1 = split(axis = var_1782_axis_0, split_sizes = var_1782_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_1782_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400147328)))];
+            tensor<fp16, [1, 1, 2560]> h_21_cast_fp16 = mul(x = var_1782_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_1793 = const()[name = string("op_1793"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1794 = transpose(perm = var_1793, x = h_21_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 1, 1]> input_85 = expand_dims(axes = input_85_axes_0, x = var_1794)[name = string("input_85")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_85)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_85)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 1]> input_87 = mul(x = gate_15, y = up_7)[name = string("input_87")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_87)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_1834_axes_0 = const()[name = string("op_1834_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1834 = squeeze(axes = var_1834_axes_0, x = mlp_out_7)[name = string("op_1834")];
+            tensor<int32, [3]> var_1838 = const()[name = string("op_1838"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1844 = const()[name = string("op_1844"), val = int32(-1)];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_59 = transpose(perm = var_1838, x = var_1834)[name = string("transpose_55")];
+            tensor<fp16, [1, 1, 2560]> var_1846 = mul(x = x_59, y = const_29_promoted)[name = string("op_1846")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_89 = concat(axis = var_1844, interleave = input_89_interleave_0, values = (x_59, var_1846))[name = string("input_89")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1841_to_fp16 = const()[name = string("op_1841_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_1841_to_fp16, x = input_89)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_1851_split_sizes_0 = const()[name = string("op_1851_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1851_axis_0 = const()[name = string("op_1851_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1851_0, tensor<fp16, [1, 1, 2560]> var_1851_1 = split(axis = var_1851_axis_0, split_sizes = var_1851_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_1851")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_33 = mul(x = var_1851_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_35_cast_fp16 = add(x = x_57_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 9216])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 1, 9472])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_1879 = const()[name = string("op_1879"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_91_axes_0 = const()[name = string("input_91_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1880 = transpose(perm = var_1879, x = hidden_states_35_cast_fp16)[name = string("transpose_54")];
+            tensor<fp16, [1, 2560, 1, 1]> input_91 = expand_dims(axes = input_91_axes_0, x = var_1880)[name = string("input_91")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_91)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_1899 = const()[name = string("op_1899"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_1900_cast_fp16 = transpose(perm = var_1899, x = per_layer_slice_7_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_1900_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_93_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_93_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400152512))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400480256))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_93_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_1916_axes_0 = const()[name = string("op_1916_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1916_cast_fp16 = squeeze(axes = var_1916_axes_0, x = gated_23_cast_fp16)[name = string("op_1916_cast_fp16")];
+            tensor<int32, [3]> var_1920 = const()[name = string("op_1920"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1926 = const()[name = string("op_1926"), val = int32(-1)];
+            fp16 const_30_promoted_to_fp16 = const()[name = string("const_30_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_61_cast_fp16 = transpose(perm = var_1920, x = var_1916_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 1, 2560]> var_1928_cast_fp16 = mul(x = x_61_cast_fp16, y = const_30_promoted_to_fp16)[name = string("op_1928_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_95_cast_fp16 = concat(axis = var_1926, interleave = input_95_interleave_0, values = (x_61_cast_fp16, var_1928_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1923_to_fp16 = const()[name = string("op_1923_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_1923_to_fp16, x = input_95_cast_fp16)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_1933_split_sizes_0 = const()[name = string("op_1933_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1933_axis_0 = const()[name = string("op_1933_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1933_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1933_cast_fp16_1 = split(axis = var_1933_axis_0, split_sizes = var_1933_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_1933_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400482880)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_39_cast_fp16 = mul(x = var_1933_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_63_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_31_promoted_to_fp16)[name = string("x_63_cast_fp16")];
+            int32 var_1948 = const()[name = string("op_1948"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_1950_cast_fp16 = mul(x = x_63_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_1950_cast_fp16")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_97_cast_fp16 = concat(axis = var_1948, interleave = input_97_interleave_0, values = (x_63_cast_fp16, var_1950_cast_fp16))[name = string("input_97_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1945_to_fp16 = const()[name = string("op_1945_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_1945_to_fp16, x = input_97_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_1955_split_sizes_0 = const()[name = string("op_1955_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1955_axis_0 = const()[name = string("op_1955_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_1955_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_1955_cast_fp16_1 = split(axis = var_1955_axis_0, split_sizes = var_1955_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_1955_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400488064)))];
+            tensor<fp16, [1, 1, 2560]> h_25_cast_fp16 = mul(x = var_1955_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_1961 = const()[name = string("op_1961"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1964_axes_0 = const()[name = string("op_1964_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_1962_cast_fp16 = transpose(perm = var_1961, x = h_25_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 2560, 1, 1]> var_1964_cast_fp16 = expand_dims(axes = var_1964_axes_0, x = var_1962_cast_fp16)[name = string("op_1964_cast_fp16")];
+            string var_1980_pad_type_0 = const()[name = string("op_1980_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_1980_strides_0 = const()[name = string("op_1980_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_1980_pad_0 = const()[name = string("op_1980_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_1980_dilations_0 = const()[name = string("op_1980_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_1980_groups_0 = const()[name = string("op_1980_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_1980 = conv(dilations = var_1980_dilations_0, groups = var_1980_groups_0, pad = var_1980_pad_0, pad_type = var_1980_pad_type_0, strides = var_1980_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_1964_cast_fp16)[name = string("op_1980")];
+            tensor<int32, [4]> var_1985 = const()[name = string("op_1985"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_1986 = reshape(shape = var_1985, x = var_1980)[name = string("op_1986")];
+            tensor<int32, [4]> var_1991 = const()[name = string("op_1991"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2001 = const()[name = string("op_2001"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_1992 = transpose(perm = var_1991, x = var_1986)[name = string("transpose_50")];
+            tensor<fp16, [1, 8, 256]> x_65 = reshape(shape = var_2001, x = var_1992)[name = string("x_65")];
+            int32 var_2007 = const()[name = string("op_2007"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2009 = mul(x = x_65, y = const_33_promoted)[name = string("op_2009")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_101 = concat(axis = var_2007, interleave = input_101_interleave_0, values = (x_65, var_2009))[name = string("input_101")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2004_to_fp16 = const()[name = string("op_2004_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2004_to_fp16, x = input_101)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2014_split_sizes_0 = const()[name = string("op_2014_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2014_axis_0 = const()[name = string("op_2014_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2014_0, tensor<fp16, [1, 8, 256]> var_2014_1 = split(axis = var_2014_axis_0, split_sizes = var_2014_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2014")];
+            tensor<fp16, [1, 8, 256]> var_2016 = mul(x = var_2014_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2016")];
+            tensor<int32, [4]> var_2021 = const()[name = string("op_2021"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_27 = reshape(shape = var_2021, x = var_2016)[name = string("q_27")];
+            tensor<fp16, [1, 8, 1, 256]> var_2023_cast_fp16 = mul(x = q_27, y = cos_s)[name = string("op_2023_cast_fp16")];
+            tensor<int32, [2]> var_2024_split_sizes_0 = const()[name = string("op_2024_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2024_axis_0 = const()[name = string("op_2024_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2024_0, tensor<fp16, [1, 8, 1, 128]> var_2024_1 = split(axis = var_2024_axis_0, split_sizes = var_2024_split_sizes_0, x = q_27)[name = string("op_2024")];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2026 = mul(x = var_2024_1, y = const_34_promoted)[name = string("op_2026")];
+            int32 var_2028 = const()[name = string("op_2028"), val = int32(-1)];
+            bool var_2029_interleave_0 = const()[name = string("op_2029_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2029 = concat(axis = var_2028, interleave = var_2029_interleave_0, values = (var_2026, var_2024_0))[name = string("op_2029")];
+            tensor<fp16, [1, 8, 1, 256]> var_2030_cast_fp16 = mul(x = var_2029, y = sin_s)[name = string("op_2030_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_29_cast_fp16 = add(x = var_2023_cast_fp16, y = var_2030_cast_fp16)[name = string("q_29_cast_fp16")];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_29_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_67_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 1, 512]> var_2062 = sub(x = x_67_cast_fp16, y = reduce_max_4)[name = string("op_2062")];
+            tensor<fp16, [1, 8, 1, 512]> var_2068 = exp(x = var_2062)[name = string("op_2068")];
+            tensor<int32, [1]> var_2078_axes_0 = const()[name = string("op_2078_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2078_keep_dims_0 = const()[name = string("op_2078_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2078 = reduce_sum(axes = var_2078_axes_0, keep_dims = var_2078_keep_dims_0, x = var_2068)[name = string("op_2078")];
+            tensor<fp16, [1, 8, 1, 512]> var_2084_cast_fp16 = real_div(x = var_2068, y = var_2078)[name = string("op_2084_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_2084_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_2095 = const()[name = string("op_2095"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2102 = const()[name = string("op_2102"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2096_cast_fp16 = transpose(perm = var_2095, x = attn_output_25_cast_fp16)[name = string("transpose_49")];
+            tensor<fp16, [1, 1, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_2102, x = var_2096_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_2107 = const()[name = string("op_2107"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2123_pad_type_0 = const()[name = string("op_2123_pad_type_0"), val = string("valid")];
+            int32 var_2123_groups_0 = const()[name = string("op_2123_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2123_strides_0 = const()[name = string("op_2123_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2123_pad_0 = const()[name = string("op_2123_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2123_dilations_0 = const()[name = string("op_2123_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400493248))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403114752))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2108_cast_fp16 = transpose(perm = var_2107, x = attn_output_27_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [1, 2560, 1]> var_2123_cast_fp16 = conv(dilations = var_2123_dilations_0, groups = var_2123_groups_0, pad = var_2123_pad_0, pad_type = var_2123_pad_type_0, strides = var_2123_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_2108_cast_fp16)[name = string("op_2123_cast_fp16")];
+            tensor<int32, [3]> var_2127 = const()[name = string("op_2127"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2133 = const()[name = string("op_2133"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_71_cast_fp16 = transpose(perm = var_2127, x = var_2123_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 1, 2560]> var_2135_cast_fp16 = mul(x = x_71_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2135_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_105_cast_fp16 = concat(axis = var_2133, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_2135_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2130_to_fp16 = const()[name = string("op_2130_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2130_to_fp16, x = input_105_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2140_split_sizes_0 = const()[name = string("op_2140_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2140_axis_0 = const()[name = string("op_2140_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2140_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2140_cast_fp16_1 = split(axis = var_2140_axis_0, split_sizes = var_2140_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2140_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403117376)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_29_cast_fp16 = mul(x = var_2140_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_73_cast_fp16 = add(x = x_63_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2149 = const()[name = string("op_2149"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2151_cast_fp16 = mul(x = x_73_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2151_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_107_cast_fp16 = concat(axis = var_2149, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_2151_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2146_to_fp16 = const()[name = string("op_2146_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2146_to_fp16, x = input_107_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2156_split_sizes_0 = const()[name = string("op_2156_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2156_axis_0 = const()[name = string("op_2156_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2156_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2156_cast_fp16_1 = split(axis = var_2156_axis_0, split_sizes = var_2156_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2156_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403122560)))];
+            tensor<fp16, [1, 1, 2560]> h_27_cast_fp16 = mul(x = var_2156_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_2167 = const()[name = string("op_2167"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2168 = transpose(perm = var_2167, x = h_27_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 2560, 1, 1]> input_109 = expand_dims(axes = input_109_axes_0, x = var_2168)[name = string("input_109")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 1]> input_111 = mul(x = gate_19, y = up_9)[name = string("input_111")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_2208_axes_0 = const()[name = string("op_2208_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2208 = squeeze(axes = var_2208_axes_0, x = mlp_out_9)[name = string("op_2208")];
+            tensor<int32, [3]> var_2212 = const()[name = string("op_2212"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2218 = const()[name = string("op_2218"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_75 = transpose(perm = var_2212, x = var_2208)[name = string("transpose_45")];
+            tensor<fp16, [1, 1, 2560]> var_2220 = mul(x = x_75, y = const_37_promoted)[name = string("op_2220")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_113 = concat(axis = var_2218, interleave = input_113_interleave_0, values = (x_75, var_2220))[name = string("input_113")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2215_to_fp16 = const()[name = string("op_2215_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_2215_to_fp16, x = input_113)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_2225_split_sizes_0 = const()[name = string("op_2225_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2225_axis_0 = const()[name = string("op_2225_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2225_0, tensor<fp16, [1, 1, 2560]> var_2225_1 = split(axis = var_2225_axis_0, split_sizes = var_2225_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_2225")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_43 = mul(x = var_2225_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_45_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 9472])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 1, 9728])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_2253 = const()[name = string("op_2253"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2254 = transpose(perm = var_2253, x = hidden_states_45_cast_fp16)[name = string("transpose_44")];
+            tensor<fp16, [1, 2560, 1, 1]> input_115 = expand_dims(axes = input_115_axes_0, x = var_2254)[name = string("input_115")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_2273 = const()[name = string("op_2273"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2274_cast_fp16 = transpose(perm = var_2273, x = per_layer_slice_9_cast_fp16)[name = string("transpose_43")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_2274_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_117_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403127744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403455488))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_2290_axes_0 = const()[name = string("op_2290_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2290_cast_fp16 = squeeze(axes = var_2290_axes_0, x = gated_29_cast_fp16)[name = string("op_2290_cast_fp16")];
+            tensor<int32, [3]> var_2294 = const()[name = string("op_2294"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2300 = const()[name = string("op_2300"), val = int32(-1)];
+            fp16 const_38_promoted_to_fp16 = const()[name = string("const_38_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_77_cast_fp16 = transpose(perm = var_2294, x = var_2290_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 1, 2560]> var_2302_cast_fp16 = mul(x = x_77_cast_fp16, y = const_38_promoted_to_fp16)[name = string("op_2302_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_119_cast_fp16 = concat(axis = var_2300, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_2302_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2297_to_fp16 = const()[name = string("op_2297_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_2297_to_fp16, x = input_119_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_2307_split_sizes_0 = const()[name = string("op_2307_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2307_axis_0 = const()[name = string("op_2307_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2307_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2307_cast_fp16_1 = split(axis = var_2307_axis_0, split_sizes = var_2307_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_2307_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403458112)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_49_cast_fp16 = mul(x = var_2307_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_39_promoted_to_fp16 = const()[name = string("const_39_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_79_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_39_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_2322 = const()[name = string("op_2322"), val = int32(-1)];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2324_cast_fp16 = mul(x = x_79_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_2324_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_121_cast_fp16 = concat(axis = var_2322, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_2324_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2319_to_fp16 = const()[name = string("op_2319_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_2319_to_fp16, x = input_121_cast_fp16)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_2329_split_sizes_0 = const()[name = string("op_2329_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2329_axis_0 = const()[name = string("op_2329_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2329_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2329_cast_fp16_1 = split(axis = var_2329_axis_0, split_sizes = var_2329_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_2329_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403463296)))];
+            tensor<fp16, [1, 1, 2560]> h_31_cast_fp16 = mul(x = var_2329_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_2335 = const()[name = string("op_2335"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2338_axes_0 = const()[name = string("op_2338_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2336_cast_fp16 = transpose(perm = var_2335, x = h_31_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2338_cast_fp16 = expand_dims(axes = var_2338_axes_0, x = var_2336_cast_fp16)[name = string("op_2338_cast_fp16")];
+            string var_2354_pad_type_0 = const()[name = string("op_2354_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2354_strides_0 = const()[name = string("op_2354_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2354_pad_0 = const()[name = string("op_2354_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2354_dilations_0 = const()[name = string("op_2354_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2354_groups_0 = const()[name = string("op_2354_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2354 = conv(dilations = var_2354_dilations_0, groups = var_2354_groups_0, pad = var_2354_pad_0, pad_type = var_2354_pad_type_0, strides = var_2354_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_2338_cast_fp16)[name = string("op_2354")];
+            tensor<int32, [4]> var_2359 = const()[name = string("op_2359"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2360 = reshape(shape = var_2359, x = var_2354)[name = string("op_2360")];
+            tensor<int32, [4]> var_2365 = const()[name = string("op_2365"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2375 = const()[name = string("op_2375"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2366 = transpose(perm = var_2365, x = var_2360)[name = string("transpose_40")];
+            tensor<fp16, [1, 8, 256]> x_81 = reshape(shape = var_2375, x = var_2366)[name = string("x_81")];
+            int32 var_2381 = const()[name = string("op_2381"), val = int32(-1)];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2383 = mul(x = x_81, y = const_41_promoted)[name = string("op_2383")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_125 = concat(axis = var_2381, interleave = input_125_interleave_0, values = (x_81, var_2383))[name = string("input_125")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2378_to_fp16 = const()[name = string("op_2378_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_2378_to_fp16, x = input_125)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_2388_split_sizes_0 = const()[name = string("op_2388_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2388_axis_0 = const()[name = string("op_2388_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2388_0, tensor<fp16, [1, 8, 256]> var_2388_1 = split(axis = var_2388_axis_0, split_sizes = var_2388_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_2388")];
+            tensor<fp16, [1, 8, 256]> var_2390 = mul(x = var_2388_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2390")];
+            tensor<int32, [4]> var_2395 = const()[name = string("op_2395"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_33 = reshape(shape = var_2395, x = var_2390)[name = string("q_33")];
+            tensor<fp16, [1, 8, 1, 256]> var_2397_cast_fp16 = mul(x = q_33, y = cos_s)[name = string("op_2397_cast_fp16")];
+            tensor<int32, [2]> var_2398_split_sizes_0 = const()[name = string("op_2398_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2398_axis_0 = const()[name = string("op_2398_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2398_0, tensor<fp16, [1, 8, 1, 128]> var_2398_1 = split(axis = var_2398_axis_0, split_sizes = var_2398_split_sizes_0, x = q_33)[name = string("op_2398")];
+            fp16 const_42_promoted = const()[name = string("const_42_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2400 = mul(x = var_2398_1, y = const_42_promoted)[name = string("op_2400")];
+            int32 var_2402 = const()[name = string("op_2402"), val = int32(-1)];
+            bool var_2403_interleave_0 = const()[name = string("op_2403_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2403 = concat(axis = var_2402, interleave = var_2403_interleave_0, values = (var_2400, var_2398_0))[name = string("op_2403")];
+            tensor<fp16, [1, 8, 1, 256]> var_2404_cast_fp16 = mul(x = var_2403, y = sin_s)[name = string("op_2404_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_35_cast_fp16 = add(x = var_2397_cast_fp16, y = var_2404_cast_fp16)[name = string("q_35_cast_fp16")];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_35_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_83_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_sliding)[name = string("x_83_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_83_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 1, 512]> var_2436 = sub(x = x_83_cast_fp16, y = reduce_max_5)[name = string("op_2436")];
+            tensor<fp16, [1, 8, 1, 512]> var_2442 = exp(x = var_2436)[name = string("op_2442")];
+            tensor<int32, [1]> var_2452_axes_0 = const()[name = string("op_2452_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2452_keep_dims_0 = const()[name = string("op_2452_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2452 = reduce_sum(axes = var_2452_axes_0, keep_dims = var_2452_keep_dims_0, x = var_2442)[name = string("op_2452")];
+            tensor<fp16, [1, 8, 1, 512]> var_2458_cast_fp16 = real_div(x = var_2442, y = var_2452)[name = string("op_2458_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_2458_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_2469 = const()[name = string("op_2469"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2476 = const()[name = string("op_2476"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2470_cast_fp16 = transpose(perm = var_2469, x = attn_output_31_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 1, 2048]> attn_output_33_cast_fp16 = reshape(shape = var_2476, x = var_2470_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_2481 = const()[name = string("op_2481"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2497_pad_type_0 = const()[name = string("op_2497_pad_type_0"), val = string("valid")];
+            int32 var_2497_groups_0 = const()[name = string("op_2497_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2497_strides_0 = const()[name = string("op_2497_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2497_pad_0 = const()[name = string("op_2497_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2497_dilations_0 = const()[name = string("op_2497_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403468480))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406089984))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2482_cast_fp16 = transpose(perm = var_2481, x = attn_output_33_cast_fp16)[name = string("transpose_38")];
+            tensor<fp16, [1, 2560, 1]> var_2497_cast_fp16 = conv(dilations = var_2497_dilations_0, groups = var_2497_groups_0, pad = var_2497_pad_0, pad_type = var_2497_pad_type_0, strides = var_2497_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_2482_cast_fp16)[name = string("op_2497_cast_fp16")];
+            tensor<int32, [3]> var_2501 = const()[name = string("op_2501"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2507 = const()[name = string("op_2507"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_87_cast_fp16 = transpose(perm = var_2501, x = var_2497_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 1, 2560]> var_2509_cast_fp16 = mul(x = x_87_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_2509_cast_fp16")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_129_cast_fp16 = concat(axis = var_2507, interleave = input_129_interleave_0, values = (x_87_cast_fp16, var_2509_cast_fp16))[name = string("input_129_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2504_to_fp16 = const()[name = string("op_2504_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_2504_to_fp16, x = input_129_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_2514_split_sizes_0 = const()[name = string("op_2514_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2514_axis_0 = const()[name = string("op_2514_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2514_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2514_cast_fp16_1 = split(axis = var_2514_axis_0, split_sizes = var_2514_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_2514_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406092608)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_35_cast_fp16 = mul(x = var_2514_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_89_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_89_cast_fp16")];
+            int32 var_2523 = const()[name = string("op_2523"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2525_cast_fp16 = mul(x = x_89_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2525_cast_fp16")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_131_cast_fp16 = concat(axis = var_2523, interleave = input_131_interleave_0, values = (x_89_cast_fp16, var_2525_cast_fp16))[name = string("input_131_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2520_to_fp16 = const()[name = string("op_2520_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_2520_to_fp16, x = input_131_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_2530_split_sizes_0 = const()[name = string("op_2530_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2530_axis_0 = const()[name = string("op_2530_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2530_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2530_cast_fp16_1 = split(axis = var_2530_axis_0, split_sizes = var_2530_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_2530_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406097792)))];
+            tensor<fp16, [1, 1, 2560]> h_33_cast_fp16 = mul(x = var_2530_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_2541 = const()[name = string("op_2541"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_133_axes_0 = const()[name = string("input_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2542 = transpose(perm = var_2541, x = h_33_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 2560, 1, 1]> input_133 = expand_dims(axes = input_133_axes_0, x = var_2542)[name = string("input_133")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_133)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_133)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 1]> input_135 = mul(x = gate_23, y = up_11)[name = string("input_135")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_135)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_2582_axes_0 = const()[name = string("op_2582_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2582 = squeeze(axes = var_2582_axes_0, x = mlp_out_11)[name = string("op_2582")];
+            tensor<int32, [3]> var_2586 = const()[name = string("op_2586"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2592 = const()[name = string("op_2592"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_91 = transpose(perm = var_2586, x = var_2582)[name = string("transpose_35")];
+            tensor<fp16, [1, 1, 2560]> var_2594 = mul(x = x_91, y = const_45_promoted)[name = string("op_2594")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_137 = concat(axis = var_2592, interleave = input_137_interleave_0, values = (x_91, var_2594))[name = string("input_137")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2589_to_fp16 = const()[name = string("op_2589_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_2589_to_fp16, x = input_137)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_2599_split_sizes_0 = const()[name = string("op_2599_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2599_axis_0 = const()[name = string("op_2599_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2599_0, tensor<fp16, [1, 1, 2560]> var_2599_1 = split(axis = var_2599_axis_0, split_sizes = var_2599_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_2599")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_53 = mul(x = var_2599_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_55_cast_fp16 = add(x = x_89_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 9728])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 1, 9984])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_2627 = const()[name = string("op_2627"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2628 = transpose(perm = var_2627, x = hidden_states_55_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 2560, 1, 1]> input_139 = expand_dims(axes = input_139_axes_0, x = var_2628)[name = string("input_139")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_139)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_2647 = const()[name = string("op_2647"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_2648_cast_fp16 = transpose(perm = var_2647, x = per_layer_slice_11_cast_fp16)[name = string("transpose_33")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_2648_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_141_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_141_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406102976))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406430720))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_141_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_2664_axes_0 = const()[name = string("op_2664_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2664_cast_fp16 = squeeze(axes = var_2664_axes_0, x = gated_35_cast_fp16)[name = string("op_2664_cast_fp16")];
+            tensor<int32, [3]> var_2668 = const()[name = string("op_2668"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2674 = const()[name = string("op_2674"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_93_cast_fp16 = transpose(perm = var_2668, x = var_2664_cast_fp16)[name = string("transpose_32")];
+            tensor<fp16, [1, 1, 2560]> var_2676_cast_fp16 = mul(x = x_93_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_2676_cast_fp16")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_143_cast_fp16 = concat(axis = var_2674, interleave = input_143_interleave_0, values = (x_93_cast_fp16, var_2676_cast_fp16))[name = string("input_143_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2671_to_fp16 = const()[name = string("op_2671_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_2671_to_fp16, x = input_143_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_2681_split_sizes_0 = const()[name = string("op_2681_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2681_axis_0 = const()[name = string("op_2681_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2681_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2681_cast_fp16_1 = split(axis = var_2681_axis_0, split_sizes = var_2681_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_2681_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406433344)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_59_cast_fp16 = mul(x = var_2681_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c4p-1])];
+            tensor<fp16, [1, 1, 2560]> x_95_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_95_cast_fp16")];
+            int32 var_2696 = const()[name = string("op_2696"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2698_cast_fp16 = mul(x = x_95_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_2698_cast_fp16")];
+            bool input_145_interleave_0 = const()[name = string("input_145_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_145_cast_fp16 = concat(axis = var_2696, interleave = input_145_interleave_0, values = (x_95_cast_fp16, var_2698_cast_fp16))[name = string("input_145_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2693_to_fp16 = const()[name = string("op_2693_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_2693_to_fp16, x = input_145_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_2703_split_sizes_0 = const()[name = string("op_2703_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2703_axis_0 = const()[name = string("op_2703_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2703_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2703_cast_fp16_1 = split(axis = var_2703_axis_0, split_sizes = var_2703_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_2703_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406438528)))];
+            tensor<fp16, [1, 1, 2560]> h_37_cast_fp16 = mul(x = var_2703_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_2709 = const()[name = string("op_2709"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2712_axes_0 = const()[name = string("op_2712_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2710_cast_fp16 = transpose(perm = var_2709, x = h_37_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [1, 2560, 1, 1]> var_2712_cast_fp16 = expand_dims(axes = var_2712_axes_0, x = var_2710_cast_fp16)[name = string("op_2712_cast_fp16")];
+            string var_2728_pad_type_0 = const()[name = string("op_2728_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_2728_strides_0 = const()[name = string("op_2728_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_2728_pad_0 = const()[name = string("op_2728_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_2728_dilations_0 = const()[name = string("op_2728_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_2728_groups_0 = const()[name = string("op_2728_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_2728 = conv(dilations = var_2728_dilations_0, groups = var_2728_groups_0, pad = var_2728_pad_0, pad_type = var_2728_pad_type_0, strides = var_2728_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_2712_cast_fp16)[name = string("op_2728")];
+            tensor<int32, [4]> var_2733 = const()[name = string("op_2733"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_2734 = reshape(shape = var_2733, x = var_2728)[name = string("op_2734")];
+            tensor<int32, [4]> var_2739 = const()[name = string("op_2739"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_2749 = const()[name = string("op_2749"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_2740 = transpose(perm = var_2739, x = var_2734)[name = string("transpose_30")];
+            tensor<fp16, [1, 8, 256]> x_97 = reshape(shape = var_2749, x = var_2740)[name = string("x_97")];
+            int32 var_2755 = const()[name = string("op_2755"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_2757 = mul(x = x_97, y = const_49_promoted)[name = string("op_2757")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_149 = concat(axis = var_2755, interleave = input_149_interleave_0, values = (x_97, var_2757))[name = string("input_149")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2752_to_fp16 = const()[name = string("op_2752_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_2752_to_fp16, x = input_149)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_2762_split_sizes_0 = const()[name = string("op_2762_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2762_axis_0 = const()[name = string("op_2762_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_2762_0, tensor<fp16, [1, 8, 256]> var_2762_1 = split(axis = var_2762_axis_0, split_sizes = var_2762_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_2762")];
+            tensor<fp16, [1, 8, 256]> var_2764 = mul(x = var_2762_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_2764")];
+            tensor<int32, [4]> var_2769 = const()[name = string("op_2769"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_39 = reshape(shape = var_2769, x = var_2764)[name = string("q_39")];
+            tensor<fp16, [1, 8, 1, 256]> var_2771_cast_fp16 = mul(x = q_39, y = cos_s)[name = string("op_2771_cast_fp16")];
+            tensor<int32, [2]> var_2772_split_sizes_0 = const()[name = string("op_2772_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2772_axis_0 = const()[name = string("op_2772_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_2772_0, tensor<fp16, [1, 8, 1, 128]> var_2772_1 = split(axis = var_2772_axis_0, split_sizes = var_2772_split_sizes_0, x = q_39)[name = string("op_2772")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_2774 = mul(x = var_2772_1, y = const_50_promoted)[name = string("op_2774")];
+            int32 var_2776 = const()[name = string("op_2776"), val = int32(-1)];
+            bool var_2777_interleave_0 = const()[name = string("op_2777_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_2777 = concat(axis = var_2776, interleave = var_2777_interleave_0, values = (var_2774, var_2772_0))[name = string("op_2777")];
+            tensor<fp16, [1, 8, 1, 256]> var_2778_cast_fp16 = mul(x = var_2777, y = sin_s)[name = string("op_2778_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_41_cast_fp16 = add(x = var_2771_cast_fp16, y = var_2778_cast_fp16)[name = string("q_41_cast_fp16")];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_41_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_99_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_99_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 1, 512]> var_2810 = sub(x = x_99_cast_fp16, y = reduce_max_6)[name = string("op_2810")];
+            tensor<fp16, [1, 8, 1, 512]> var_2816 = exp(x = var_2810)[name = string("op_2816")];
+            tensor<int32, [1]> var_2826_axes_0 = const()[name = string("op_2826_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2826_keep_dims_0 = const()[name = string("op_2826_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_2826 = reduce_sum(axes = var_2826_axes_0, keep_dims = var_2826_keep_dims_0, x = var_2816)[name = string("op_2826")];
+            tensor<fp16, [1, 8, 1, 512]> var_2832_cast_fp16 = real_div(x = var_2816, y = var_2826)[name = string("op_2832_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_2832_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_2843 = const()[name = string("op_2843"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2850 = const()[name = string("op_2850"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_2844_cast_fp16 = transpose(perm = var_2843, x = attn_output_37_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 1, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_2850, x = var_2844_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_2855 = const()[name = string("op_2855"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2871_pad_type_0 = const()[name = string("op_2871_pad_type_0"), val = string("valid")];
+            int32 var_2871_groups_0 = const()[name = string("op_2871_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2871_strides_0 = const()[name = string("op_2871_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2871_pad_0 = const()[name = string("op_2871_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2871_dilations_0 = const()[name = string("op_2871_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406443712))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409065216))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_2856_cast_fp16 = transpose(perm = var_2855, x = attn_output_39_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 2560, 1]> var_2871_cast_fp16 = conv(dilations = var_2871_dilations_0, groups = var_2871_groups_0, pad = var_2871_pad_0, pad_type = var_2871_pad_type_0, strides = var_2871_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_2856_cast_fp16)[name = string("op_2871_cast_fp16")];
+            tensor<int32, [3]> var_2875 = const()[name = string("op_2875"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2881 = const()[name = string("op_2881"), val = int32(-1)];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_103_cast_fp16 = transpose(perm = var_2875, x = var_2871_cast_fp16)[name = string("transpose_27")];
+            tensor<fp16, [1, 1, 2560]> var_2883_cast_fp16 = mul(x = x_103_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_2883_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_153_cast_fp16 = concat(axis = var_2881, interleave = input_153_interleave_0, values = (x_103_cast_fp16, var_2883_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2878_to_fp16 = const()[name = string("op_2878_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_2878_to_fp16, x = input_153_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_2888_split_sizes_0 = const()[name = string("op_2888_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2888_axis_0 = const()[name = string("op_2888_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2888_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2888_cast_fp16_1 = split(axis = var_2888_axis_0, split_sizes = var_2888_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_2888_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409067840)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_41_cast_fp16 = mul(x = var_2888_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_105_cast_fp16 = add(x = x_95_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_105_cast_fp16")];
+            int32 var_2897 = const()[name = string("op_2897"), val = int32(-1)];
+            fp16 const_52_promoted_to_fp16 = const()[name = string("const_52_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_2899_cast_fp16 = mul(x = x_105_cast_fp16, y = const_52_promoted_to_fp16)[name = string("op_2899_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_155_cast_fp16 = concat(axis = var_2897, interleave = input_155_interleave_0, values = (x_105_cast_fp16, var_2899_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2894_to_fp16 = const()[name = string("op_2894_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_2894_to_fp16, x = input_155_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_2904_split_sizes_0 = const()[name = string("op_2904_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2904_axis_0 = const()[name = string("op_2904_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2904_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_2904_cast_fp16_1 = split(axis = var_2904_axis_0, split_sizes = var_2904_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_2904_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409073024)))];
+            tensor<fp16, [1, 1, 2560]> h_39_cast_fp16 = mul(x = var_2904_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_2915 = const()[name = string("op_2915"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_157_axes_0 = const()[name = string("input_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2916 = transpose(perm = var_2915, x = h_39_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 1, 1]> input_157 = expand_dims(axes = input_157_axes_0, x = var_2916)[name = string("input_157")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_157)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_157)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 1]> input_159 = mul(x = gate_27, y = up_13)[name = string("input_159")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_159)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_2956_axes_0 = const()[name = string("op_2956_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_2956 = squeeze(axes = var_2956_axes_0, x = mlp_out_13)[name = string("op_2956")];
+            tensor<int32, [3]> var_2960 = const()[name = string("op_2960"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2966 = const()[name = string("op_2966"), val = int32(-1)];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_107 = transpose(perm = var_2960, x = var_2956)[name = string("transpose_25")];
+            tensor<fp16, [1, 1, 2560]> var_2968 = mul(x = x_107, y = const_53_promoted)[name = string("op_2968")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_161 = concat(axis = var_2966, interleave = input_161_interleave_0, values = (x_107, var_2968))[name = string("input_161")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2963_to_fp16 = const()[name = string("op_2963_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_2963_to_fp16, x = input_161)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_2973_split_sizes_0 = const()[name = string("op_2973_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2973_axis_0 = const()[name = string("op_2973_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_2973_0, tensor<fp16, [1, 1, 2560]> var_2973_1 = split(axis = var_2973_axis_0, split_sizes = var_2973_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_2973")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_63 = mul(x = var_2973_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_65_cast_fp16 = add(x = x_105_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 9984])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 1, 10240])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_3001 = const()[name = string("op_3001"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_163_axes_0 = const()[name = string("input_163_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3002 = transpose(perm = var_3001, x = hidden_states_65_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 2560, 1, 1]> input_163 = expand_dims(axes = input_163_axes_0, x = var_3002)[name = string("input_163")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_163)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_3021 = const()[name = string("op_3021"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3022_cast_fp16 = transpose(perm = var_3021, x = per_layer_slice_13_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_3022_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_165_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_165_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409078208))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409405952))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_165_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_3038_axes_0 = const()[name = string("op_3038_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3038_cast_fp16 = squeeze(axes = var_3038_axes_0, x = gated_41_cast_fp16)[name = string("op_3038_cast_fp16")];
+            tensor<int32, [3]> var_3042 = const()[name = string("op_3042"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3048 = const()[name = string("op_3048"), val = int32(-1)];
+            fp16 const_54_promoted_to_fp16 = const()[name = string("const_54_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_109_cast_fp16 = transpose(perm = var_3042, x = var_3038_cast_fp16)[name = string("transpose_22")];
+            tensor<fp16, [1, 1, 2560]> var_3050_cast_fp16 = mul(x = x_109_cast_fp16, y = const_54_promoted_to_fp16)[name = string("op_3050_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_167_cast_fp16 = concat(axis = var_3048, interleave = input_167_interleave_0, values = (x_109_cast_fp16, var_3050_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3045_to_fp16 = const()[name = string("op_3045_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_3045_to_fp16, x = input_167_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_3055_split_sizes_0 = const()[name = string("op_3055_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3055_axis_0 = const()[name = string("op_3055_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3055_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3055_cast_fp16_1 = split(axis = var_3055_axis_0, split_sizes = var_3055_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_3055_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409408576)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_69_cast_fp16 = mul(x = var_3055_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b6p-1])];
+            tensor<fp16, [1, 1, 2560]> x_111_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_55_promoted_to_fp16)[name = string("x_111_cast_fp16")];
+            int32 var_3070 = const()[name = string("op_3070"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3072_cast_fp16 = mul(x = x_111_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3072_cast_fp16")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_169_cast_fp16 = concat(axis = var_3070, interleave = input_169_interleave_0, values = (x_111_cast_fp16, var_3072_cast_fp16))[name = string("input_169_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3067_to_fp16 = const()[name = string("op_3067_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_3067_to_fp16, x = input_169_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_3077_split_sizes_0 = const()[name = string("op_3077_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3077_axis_0 = const()[name = string("op_3077_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3077_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3077_cast_fp16_1 = split(axis = var_3077_axis_0, split_sizes = var_3077_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_3077_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409413760)))];
+            tensor<fp16, [1, 1, 2560]> h_43_cast_fp16 = mul(x = var_3077_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_3083 = const()[name = string("op_3083"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3086_axes_0 = const()[name = string("op_3086_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3084_cast_fp16 = transpose(perm = var_3083, x = h_43_cast_fp16)[name = string("transpose_21")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3086_cast_fp16 = expand_dims(axes = var_3086_axes_0, x = var_3084_cast_fp16)[name = string("op_3086_cast_fp16")];
+            string var_3102_pad_type_0 = const()[name = string("op_3102_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3102_strides_0 = const()[name = string("op_3102_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3102_pad_0 = const()[name = string("op_3102_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3102_dilations_0 = const()[name = string("op_3102_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3102_groups_0 = const()[name = string("op_3102_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 1]> var_3102 = conv(dilations = var_3102_dilations_0, groups = var_3102_groups_0, pad = var_3102_pad_0, pad_type = var_3102_pad_type_0, strides = var_3102_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_3086_cast_fp16)[name = string("op_3102")];
+            tensor<int32, [4]> var_3107 = const()[name = string("op_3107"), val = tensor<int32, [4]>([1, 8, 256, 1])];
+            tensor<fp16, [1, 8, 256, 1]> var_3108 = reshape(shape = var_3107, x = var_3102)[name = string("op_3108")];
+            tensor<int32, [4]> var_3113 = const()[name = string("op_3113"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3123 = const()[name = string("op_3123"), val = tensor<int32, [3]>([1, 8, 256])];
+            tensor<fp16, [1, 8, 1, 256]> var_3114 = transpose(perm = var_3113, x = var_3108)[name = string("transpose_20")];
+            tensor<fp16, [1, 8, 256]> x_113 = reshape(shape = var_3123, x = var_3114)[name = string("x_113")];
+            int32 var_3129 = const()[name = string("op_3129"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 256]> var_3131 = mul(x = x_113, y = const_57_promoted)[name = string("op_3131")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512]> input_173 = concat(axis = var_3129, interleave = input_173_interleave_0, values = (x_113, var_3131))[name = string("input_173")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3126_to_fp16 = const()[name = string("op_3126_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_3126_to_fp16, x = input_173)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_3136_split_sizes_0 = const()[name = string("op_3136_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3136_axis_0 = const()[name = string("op_3136_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 256]> var_3136_0, tensor<fp16, [1, 8, 256]> var_3136_1 = split(axis = var_3136_axis_0, split_sizes = var_3136_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_3136")];
+            tensor<fp16, [1, 8, 256]> var_3138 = mul(x = var_3136_0, y = layers_0_self_attn_q_norm_weight)[name = string("op_3138")];
+            tensor<int32, [4]> var_3143 = const()[name = string("op_3143"), val = tensor<int32, [4]>([1, 8, 1, 256])];
+            tensor<fp16, [1, 8, 1, 256]> q_45 = reshape(shape = var_3143, x = var_3138)[name = string("q_45")];
+            tensor<fp16, [1, 8, 1, 256]> var_3145_cast_fp16 = mul(x = q_45, y = cos_s)[name = string("op_3145_cast_fp16")];
+            tensor<int32, [2]> var_3146_split_sizes_0 = const()[name = string("op_3146_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3146_axis_0 = const()[name = string("op_3146_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 128]> var_3146_0, tensor<fp16, [1, 8, 1, 128]> var_3146_1 = split(axis = var_3146_axis_0, split_sizes = var_3146_split_sizes_0, x = q_45)[name = string("op_3146")];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 128]> var_3148 = mul(x = var_3146_1, y = const_58_promoted)[name = string("op_3148")];
+            int32 var_3150 = const()[name = string("op_3150"), val = int32(-1)];
+            bool var_3151_interleave_0 = const()[name = string("op_3151_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> var_3151 = concat(axis = var_3150, interleave = var_3151_interleave_0, values = (var_3148, var_3146_0))[name = string("op_3151")];
+            tensor<fp16, [1, 8, 1, 256]> var_3152_cast_fp16 = mul(x = var_3151, y = sin_s)[name = string("op_3152_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 256]> q_47_cast_fp16 = add(x = var_3145_cast_fp16, y = var_3152_cast_fp16)[name = string("q_47_cast_fp16")];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_47_cast_fp16, y = transpose_36_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> x_115_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_115_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_115_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 1, 512]> var_3184 = sub(x = x_115_cast_fp16, y = reduce_max_7)[name = string("op_3184")];
+            tensor<fp16, [1, 8, 1, 512]> var_3190 = exp(x = var_3184)[name = string("op_3190")];
+            tensor<int32, [1]> var_3200_axes_0 = const()[name = string("op_3200_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3200_keep_dims_0 = const()[name = string("op_3200_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3200 = reduce_sum(axes = var_3200_axes_0, keep_dims = var_3200_keep_dims_0, x = var_3190)[name = string("op_3200")];
+            tensor<fp16, [1, 8, 1, 512]> var_3206_cast_fp16 = real_div(x = var_3190, y = var_3200)[name = string("op_3206_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_3206_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_3217 = const()[name = string("op_3217"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3224 = const()[name = string("op_3224"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 256]> var_3218_cast_fp16 = transpose(perm = var_3217, x = attn_output_43_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 1, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_3224, x = var_3218_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_3229 = const()[name = string("op_3229"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3245_pad_type_0 = const()[name = string("op_3245_pad_type_0"), val = string("valid")];
+            int32 var_3245_groups_0 = const()[name = string("op_3245_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3245_strides_0 = const()[name = string("op_3245_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3245_pad_0 = const()[name = string("op_3245_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3245_dilations_0 = const()[name = string("op_3245_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409418944))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412040448))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 1]> var_3230_cast_fp16 = transpose(perm = var_3229, x = attn_output_45_cast_fp16)[name = string("transpose_18")];
+            tensor<fp16, [1, 2560, 1]> var_3245_cast_fp16 = conv(dilations = var_3245_dilations_0, groups = var_3245_groups_0, pad = var_3245_pad_0, pad_type = var_3245_pad_type_0, strides = var_3245_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_3230_cast_fp16)[name = string("op_3245_cast_fp16")];
+            tensor<int32, [3]> var_3249 = const()[name = string("op_3249"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3255 = const()[name = string("op_3255"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_119_cast_fp16 = transpose(perm = var_3249, x = var_3245_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 1, 2560]> var_3257_cast_fp16 = mul(x = x_119_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3257_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_177_cast_fp16 = concat(axis = var_3255, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_3257_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3252_to_fp16 = const()[name = string("op_3252_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_3252_to_fp16, x = input_177_cast_fp16)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_3262_split_sizes_0 = const()[name = string("op_3262_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3262_axis_0 = const()[name = string("op_3262_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3262_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3262_cast_fp16_1 = split(axis = var_3262_axis_0, split_sizes = var_3262_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_3262_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412043072)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_47_cast_fp16 = mul(x = var_3262_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_121_cast_fp16 = add(x = x_111_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_121_cast_fp16")];
+            int32 var_3271 = const()[name = string("op_3271"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3273_cast_fp16 = mul(x = x_121_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3273_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_179_cast_fp16 = concat(axis = var_3271, interleave = input_179_interleave_0, values = (x_121_cast_fp16, var_3273_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3268_to_fp16 = const()[name = string("op_3268_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_3268_to_fp16, x = input_179_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_3278_split_sizes_0 = const()[name = string("op_3278_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3278_axis_0 = const()[name = string("op_3278_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3278_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3278_cast_fp16_1 = split(axis = var_3278_axis_0, split_sizes = var_3278_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_3278_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412048256)))];
+            tensor<fp16, [1, 1, 2560]> h_45_cast_fp16 = mul(x = var_3278_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_3289 = const()[name = string("op_3289"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_181_axes_0 = const()[name = string("input_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3290 = transpose(perm = var_3289, x = h_45_cast_fp16)[name = string("transpose_16")];
+            tensor<fp16, [1, 2560, 1, 1]> input_181 = expand_dims(axes = input_181_axes_0, x = var_3290)[name = string("input_181")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_181)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_181)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 1]> input_183 = mul(x = gate_31, y = up_15)[name = string("input_183")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_183)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_3330_axes_0 = const()[name = string("op_3330_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3330 = squeeze(axes = var_3330_axes_0, x = mlp_out_15)[name = string("op_3330")];
+            tensor<int32, [3]> var_3334 = const()[name = string("op_3334"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3340 = const()[name = string("op_3340"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_123 = transpose(perm = var_3334, x = var_3330)[name = string("transpose_15")];
+            tensor<fp16, [1, 1, 2560]> var_3342 = mul(x = x_123, y = const_61_promoted)[name = string("op_3342")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_185 = concat(axis = var_3340, interleave = input_185_interleave_0, values = (x_123, var_3342))[name = string("input_185")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3337_to_fp16 = const()[name = string("op_3337_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_3337_to_fp16, x = input_185)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_3347_split_sizes_0 = const()[name = string("op_3347_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3347_axis_0 = const()[name = string("op_3347_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3347_0, tensor<fp16, [1, 1, 2560]> var_3347_1 = split(axis = var_3347_axis_0, split_sizes = var_3347_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_3347")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_73 = mul(x = var_3347_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_75_cast_fp16 = add(x = x_121_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 10240])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 1, 10496])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_3375 = const()[name = string("op_3375"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_187_axes_0 = const()[name = string("input_187_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3376 = transpose(perm = var_3375, x = hidden_states_75_cast_fp16)[name = string("transpose_14")];
+            tensor<fp16, [1, 2560, 1, 1]> input_187 = expand_dims(axes = input_187_axes_0, x = var_3376)[name = string("input_187")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_187)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_3395 = const()[name = string("op_3395"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3396_cast_fp16 = transpose(perm = var_3395, x = per_layer_slice_15_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_3396_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_189_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_189_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412053440))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412381184))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_189_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_3412_axes_0 = const()[name = string("op_3412_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3412_cast_fp16 = squeeze(axes = var_3412_axes_0, x = gated_47_cast_fp16)[name = string("op_3412_cast_fp16")];
+            tensor<int32, [3]> var_3416 = const()[name = string("op_3416"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3422 = const()[name = string("op_3422"), val = int32(-1)];
+            fp16 const_62_promoted_to_fp16 = const()[name = string("const_62_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_125_cast_fp16 = transpose(perm = var_3416, x = var_3412_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [1, 1, 2560]> var_3424_cast_fp16 = mul(x = x_125_cast_fp16, y = const_62_promoted_to_fp16)[name = string("op_3424_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_191_cast_fp16 = concat(axis = var_3422, interleave = input_191_interleave_0, values = (x_125_cast_fp16, var_3424_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3419_to_fp16 = const()[name = string("op_3419_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_3419_to_fp16, x = input_191_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_3429_split_sizes_0 = const()[name = string("op_3429_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3429_axis_0 = const()[name = string("op_3429_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3429_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3429_cast_fp16_1 = split(axis = var_3429_axis_0, split_sizes = var_3429_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_3429_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412383808)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_79_cast_fp16 = mul(x = var_3429_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.9ep-1])];
+            tensor<fp16, [1, 1, 2560]> x_127_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_63_promoted_to_fp16)[name = string("x_127_cast_fp16")];
+            int32 var_3444 = const()[name = string("op_3444"), val = int32(-1)];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3446_cast_fp16 = mul(x = x_127_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_3446_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_193_cast_fp16 = concat(axis = var_3444, interleave = input_193_interleave_0, values = (x_127_cast_fp16, var_3446_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3441_to_fp16 = const()[name = string("op_3441_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_3441_to_fp16, x = input_193_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_3451_split_sizes_0 = const()[name = string("op_3451_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3451_axis_0 = const()[name = string("op_3451_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3451_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3451_cast_fp16_1 = split(axis = var_3451_axis_0, split_sizes = var_3451_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_3451_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412388992)))];
+            tensor<fp16, [1, 1, 2560]> h_49_cast_fp16 = mul(x = var_3451_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_3457 = const()[name = string("op_3457"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3460_axes_0 = const()[name = string("op_3460_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3458_cast_fp16 = transpose(perm = var_3457, x = h_49_cast_fp16)[name = string("transpose_11")];
+            tensor<fp16, [1, 2560, 1, 1]> var_3460_cast_fp16 = expand_dims(axes = var_3460_axes_0, x = var_3458_cast_fp16)[name = string("op_3460_cast_fp16")];
+            string var_3476_pad_type_0 = const()[name = string("op_3476_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> var_3476_strides_0 = const()[name = string("op_3476_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> var_3476_pad_0 = const()[name = string("op_3476_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> var_3476_dilations_0 = const()[name = string("op_3476_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 var_3476_groups_0 = const()[name = string("op_3476_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 1]> var_3476 = conv(dilations = var_3476_dilations_0, groups = var_3476_groups_0, pad = var_3476_pad_0, pad_type = var_3476_pad_type_0, strides = var_3476_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_3460_cast_fp16)[name = string("op_3476")];
+            tensor<int32, [4]> var_3481 = const()[name = string("op_3481"), val = tensor<int32, [4]>([1, 8, 512, 1])];
+            tensor<fp16, [1, 8, 512, 1]> var_3482 = reshape(shape = var_3481, x = var_3476)[name = string("op_3482")];
+            tensor<int32, [4]> var_3487 = const()[name = string("op_3487"), val = tensor<int32, [4]>([0, 1, 3, 2])];
+            tensor<int32, [3]> var_3497 = const()[name = string("op_3497"), val = tensor<int32, [3]>([1, 8, 512])];
+            tensor<fp16, [1, 8, 1, 512]> var_3488 = transpose(perm = var_3487, x = var_3482)[name = string("transpose_10")];
+            tensor<fp16, [1, 8, 512]> x_129 = reshape(shape = var_3497, x = var_3488)[name = string("x_129")];
+            int32 var_3503 = const()[name = string("op_3503"), val = int32(-1)];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 512]> var_3505 = mul(x = x_129, y = const_65_promoted)[name = string("op_3505")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1024]> input_197 = concat(axis = var_3503, interleave = input_197_interleave_0, values = (x_129, var_3505))[name = string("input_197")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3500_to_fp16 = const()[name = string("op_3500_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 8, 1024]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_3500_to_fp16, x = input_197)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_3510_split_sizes_0 = const()[name = string("op_3510_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3510_axis_0 = const()[name = string("op_3510_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 512]> var_3510_0, tensor<fp16, [1, 8, 512]> var_3510_1 = split(axis = var_3510_axis_0, split_sizes = var_3510_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_3510")];
+            tensor<fp16, [1, 8, 512]> var_3512 = mul(x = var_3510_0, y = layers_2_self_attn_q_norm_weight)[name = string("op_3512")];
+            tensor<int32, [4]> var_3517 = const()[name = string("op_3517"), val = tensor<int32, [4]>([1, 8, 1, 512])];
+            tensor<fp16, [1, 8, 1, 512]> q_51 = reshape(shape = var_3517, x = var_3512)[name = string("q_51")];
+            tensor<fp16, [1, 8, 1, 512]> var_3519_cast_fp16 = mul(x = q_51, y = cos_f)[name = string("op_3519_cast_fp16")];
+            tensor<int32, [2]> var_3520_split_sizes_0 = const()[name = string("op_3520_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3520_axis_0 = const()[name = string("op_3520_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 1, 256]> var_3520_0, tensor<fp16, [1, 8, 1, 256]> var_3520_1 = split(axis = var_3520_axis_0, split_sizes = var_3520_split_sizes_0, x = q_51)[name = string("op_3520")];
+            fp16 const_66_promoted = const()[name = string("const_66_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 1, 256]> var_3522 = mul(x = var_3520_1, y = const_66_promoted)[name = string("op_3522")];
+            int32 var_3524 = const()[name = string("op_3524"), val = int32(-1)];
+            bool var_3525_interleave_0 = const()[name = string("op_3525_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> var_3525 = concat(axis = var_3524, interleave = var_3525_interleave_0, values = (var_3522, var_3520_0))[name = string("op_3525")];
+            tensor<fp16, [1, 8, 1, 512]> var_3526_cast_fp16 = mul(x = var_3525, y = sin_f)[name = string("op_3526_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 512]> q_cast_fp16 = add(x = var_3519_cast_fp16, y = var_3526_cast_fp16)[name = string("q_cast_fp16")];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 2048]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_cast_fp16, y = transpose_38_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 1, 2048]> x_131_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_full)[name = string("x_131_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_131_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3558 = sub(x = x_131_cast_fp16, y = reduce_max_8)[name = string("op_3558")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3564 = exp(x = var_3558)[name = string("op_3564")];
+            tensor<int32, [1]> var_3574_axes_0 = const()[name = string("op_3574_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3574_keep_dims_0 = const()[name = string("op_3574_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 1, 1]> var_3574 = reduce_sum(axes = var_3574_axes_0, keep_dims = var_3574_keep_dims_0, x = var_3564)[name = string("op_3574")];
+            tensor<fp16, [1, 8, 1, 2048]> var_3580_cast_fp16 = real_div(x = var_3564, y = var_3574)[name = string("op_3580_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 1, 512]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_3580_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_3591 = const()[name = string("op_3591"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3598 = const()[name = string("op_3598"), val = tensor<int32, [3]>([1, 1, -1])];
+            tensor<fp16, [1, 1, 8, 512]> var_3592_cast_fp16 = transpose(perm = var_3591, x = attn_output_49_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 1, 4096]> attn_output_51_cast_fp16 = reshape(shape = var_3598, x = var_3592_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_3603 = const()[name = string("op_3603"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3619_pad_type_0 = const()[name = string("op_3619_pad_type_0"), val = string("valid")];
+            int32 var_3619_groups_0 = const()[name = string("op_3619_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3619_strides_0 = const()[name = string("op_3619_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3619_pad_0 = const()[name = string("op_3619_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3619_dilations_0 = const()[name = string("op_3619_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412394176))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417637120))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 1]> var_3604_cast_fp16 = transpose(perm = var_3603, x = attn_output_51_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 2560, 1]> var_3619_cast_fp16 = conv(dilations = var_3619_dilations_0, groups = var_3619_groups_0, pad = var_3619_pad_0, pad_type = var_3619_pad_type_0, strides = var_3619_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_3604_cast_fp16)[name = string("op_3619_cast_fp16")];
+            tensor<int32, [3]> var_3623 = const()[name = string("op_3623"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3629 = const()[name = string("op_3629"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_135_cast_fp16 = transpose(perm = var_3623, x = var_3619_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 1, 2560]> var_3631_cast_fp16 = mul(x = x_135_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_3631_cast_fp16")];
+            bool input_201_interleave_0 = const()[name = string("input_201_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_201_cast_fp16 = concat(axis = var_3629, interleave = input_201_interleave_0, values = (x_135_cast_fp16, var_3631_cast_fp16))[name = string("input_201_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3626_to_fp16 = const()[name = string("op_3626_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_3626_to_fp16, x = input_201_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_3636_split_sizes_0 = const()[name = string("op_3636_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3636_axis_0 = const()[name = string("op_3636_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3636_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3636_cast_fp16_1 = split(axis = var_3636_axis_0, split_sizes = var_3636_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_3636_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417639744)))];
+            tensor<fp16, [1, 1, 2560]> attn_output_cast_fp16 = mul(x = var_3636_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> x_137_cast_fp16 = add(x = x_127_cast_fp16, y = attn_output_cast_fp16)[name = string("x_137_cast_fp16")];
+            int32 var_3645 = const()[name = string("op_3645"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3647_cast_fp16 = mul(x = x_137_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_3647_cast_fp16")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_203_cast_fp16 = concat(axis = var_3645, interleave = input_203_interleave_0, values = (x_137_cast_fp16, var_3647_cast_fp16))[name = string("input_203_cast_fp16")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3642_to_fp16 = const()[name = string("op_3642_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_3642_to_fp16, x = input_203_cast_fp16)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_3652_split_sizes_0 = const()[name = string("op_3652_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3652_axis_0 = const()[name = string("op_3652_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3652_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3652_cast_fp16_1 = split(axis = var_3652_axis_0, split_sizes = var_3652_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_3652_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417644928)))];
+            tensor<fp16, [1, 1, 2560]> h_51_cast_fp16 = mul(x = var_3652_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_3663 = const()[name = string("op_3663"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3664 = transpose(perm = var_3663, x = h_51_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1, 1]> input_205 = expand_dims(axes = input_205_axes_0, x = var_3664)[name = string("input_205")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_205)[name = string("gate_33")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 1]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_205)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 1]> gate = gelu(mode = gate_mode_0, x = gate_33)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 1]> input_207 = mul(x = gate, y = up)[name = string("input_207")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 1]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_207)[name = string("mlp_out")];
+            tensor<int32, [1]> var_3704_axes_0 = const()[name = string("op_3704_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3704 = squeeze(axes = var_3704_axes_0, x = mlp_out)[name = string("op_3704")];
+            tensor<int32, [3]> var_3708 = const()[name = string("op_3708"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3714 = const()[name = string("op_3714"), val = int32(-1)];
+            fp16 const_69_promoted = const()[name = string("const_69_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_139 = transpose(perm = var_3708, x = var_3704)[name = string("transpose_5")];
+            tensor<fp16, [1, 1, 2560]> var_3716 = mul(x = x_139, y = const_69_promoted)[name = string("op_3716")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_209 = concat(axis = var_3714, interleave = input_209_interleave_0, values = (x_139, var_3716))[name = string("input_209")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3711_to_fp16 = const()[name = string("op_3711_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_3711_to_fp16, x = input_209)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_3721_split_sizes_0 = const()[name = string("op_3721_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3721_axis_0 = const()[name = string("op_3721_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3721_0, tensor<fp16, [1, 1, 2560]> var_3721_1 = split(axis = var_3721_axis_0, split_sizes = var_3721_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_3721")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_83 = mul(x = var_3721_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_85_cast_fp16 = add(x = x_137_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 10496])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 1, 1])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp16, [1, 1, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_3749 = const()[name = string("op_3749"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_211_axes_0 = const()[name = string("input_211_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3750 = transpose(perm = var_3749, x = hidden_states_85_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 1]> input_211 = expand_dims(axes = input_211_axes_0, x = var_3750)[name = string("input_211")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 1]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_211)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 1]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_3769 = const()[name = string("op_3769"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 1]> var_3770_cast_fp16 = transpose(perm = var_3769, x = per_layer_slice_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 256, 1, 1]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_3770_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 1]> input_213_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_cast_fp16)[name = string("input_213_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417650112))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417977856))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 1]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_213_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_3786_axes_0 = const()[name = string("op_3786_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 1]> var_3786_cast_fp16 = squeeze(axes = var_3786_axes_0, x = gated_cast_fp16)[name = string("op_3786_cast_fp16")];
+            tensor<int32, [3]> var_3790 = const()[name = string("op_3790"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3796 = const()[name = string("op_3796"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> x_141_cast_fp16 = transpose(perm = var_3790, x = var_3786_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 1, 2560]> var_3798_cast_fp16 = mul(x = x_141_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_3798_cast_fp16")];
+            bool input_215_interleave_0 = const()[name = string("input_215_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_215_cast_fp16 = concat(axis = var_3796, interleave = input_215_interleave_0, values = (x_141_cast_fp16, var_3798_cast_fp16))[name = string("input_215_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3793_to_fp16 = const()[name = string("op_3793_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_3793_to_fp16, x = input_215_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_3803_split_sizes_0 = const()[name = string("op_3803_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3803_axis_0 = const()[name = string("op_3803_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3803_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3803_cast_fp16_1 = split(axis = var_3803_axis_0, split_sizes = var_3803_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_3803_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417980480)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_89_cast_fp16 = mul(x = var_3803_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 1, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c8p-2])];
+            tensor<fp16, [1, 1, 2560]> x_cast_fp16 = mul(x = hidden_states_cast_fp16, y = const_71_promoted_to_fp16)[name = string("x_cast_fp16")];
+            int32 var_3818 = const()[name = string("op_3818"), val = int32(-1)];
+            fp16 const_72_promoted_to_fp16 = const()[name = string("const_72_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 1, 2560]> var_3820_cast_fp16 = mul(x = x_cast_fp16, y = const_72_promoted_to_fp16)[name = string("op_3820_cast_fp16")];
+            bool input_217_interleave_0 = const()[name = string("input_217_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 1, 5120]> input_217_cast_fp16 = concat(axis = var_3818, interleave = input_217_interleave_0, values = (x_cast_fp16, var_3820_cast_fp16))[name = string("input_217_cast_fp16")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3815_to_fp16 = const()[name = string("op_3815_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 1, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_3815_to_fp16, x = input_217_cast_fp16)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_3825_split_sizes_0 = const()[name = string("op_3825_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3825_axis_0 = const()[name = string("op_3825_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 1, 2560]> var_3825_cast_fp16_0, tensor<fp16, [1, 1, 2560]> var_3825_cast_fp16_1 = split(axis = var_3825_axis_0, split_sizes = var_3825_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_3825_cast_fp16")];
+            tensor<fp16, [2560]> norm_weight_promoted_to_fp16 = const()[name = string("norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417985664)))];
+            tensor<fp16, [1, 1, 2560]> hidden_states_out = mul(x = var_3825_cast_fp16_0, y = norm_weight_promoted_to_fp16)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [3]> var_3836 = const()[name = string("op_3836"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [262144, 2560, 1]> squeeze_9_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [262144, 2560, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417990848))), lut = tensor<fp16, [8192, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(753535232))))[name = string("squeeze_9_palettized")];
+            string var_3852_pad_type_0 = const()[name = string("op_3852_pad_type_0"), val = string("valid")];
+            int32 var_3852_groups_0 = const()[name = string("op_3852_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3852_strides_0 = const()[name = string("op_3852_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3852_pad_0 = const()[name = string("op_3852_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3852_dilations_0 = const()[name = string("op_3852_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 2560, 1]> var_3837 = transpose(perm = var_3836, x = hidden_states_out)[name = string("transpose_1")];
+            tensor<fp16, [1, 262144, 1]> var_3852 = conv(dilations = var_3852_dilations_0, groups = var_3852_groups_0, pad = var_3852_pad_0, pad_type = var_3852_pad_type_0, strides = var_3852_strides_0, weight = squeeze_9_palettized, x = var_3837)[name = string("op_3852")];
+            tensor<int32, [3]> var_3856 = const()[name = string("op_3856"), val = tensor<int32, [3]>([0, 2, 1])];
+            fp16 _inversed_3859_y_0_to_fp16 = const()[name = string("_inversed_3859_y_0_to_fp16"), val = fp16(0x1.11p-5)];
+            tensor<fp16, [1, 1, 262144]> logits_1 = transpose(perm = var_3856, x = var_3852)[name = string("transpose_0")];
+            tensor<fp16, [1, 1, 262144]> _inversed_3859_cast_fp16 = mul(x = logits_1, y = _inversed_3859_y_0_to_fp16)[name = string("_inversed_3859_cast_fp16")];
+            tensor<fp16, [1, 1, 262144]> var_3860_cast_fp16 = tanh(x = _inversed_3859_cast_fp16)[name = string("op_3860_cast_fp16")];
+            fp16 var_3861_to_fp16 = const()[name = string("op_3861_to_fp16"), val = fp16(0x1.ep+4)];
+            tensor<fp16, [1, 1, 262144]> logits_3_cast_fp16 = mul(x = var_3860_cast_fp16, y = var_3861_to_fp16)[name = string("logits_3_cast_fp16")];
+            tensor<int32, [1]> logits_axes_0 = const()[name = string("logits_axes_0"), val = tensor<int32, [1]>([0])];
+            tensor<fp16, [1, 262144]> logits_cast_fp16 = squeeze(axes = logits_axes_0, x = logits_3_cast_fp16)[name = string("logits_cast_fp16")];
+            int32 var_3866 = const()[name = string("op_3866"), val = int32(-1)];
+            int32 token_id_axis_0 = const()[name = string("token_id_axis_0"), val = int32(-1)];
+            bool token_id_keep_dims_0 = const()[name = string("token_id_keep_dims_0"), val = bool(false)];
+            string token_id_output_dtype_0 = const()[name = string("token_id_output_dtype_0"), val = string("int32")];
+            tensor<int32, [1]> token_id = reduce_argmax(axis = token_id_axis_0, keep_dims = token_id_keep_dims_0, output_dtype = token_id_output_dtype_0, x = logits_cast_fp16)[name = string("token_id_cast_fp16")];
+            tensor<int32, [1]> var_3868_axes_0 = const()[name = string("op_3868_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<int32, [1, 1]> var_3868 = expand_dims(axes = var_3868_axes_0, x = token_id)[name = string("op_3868")];
+            bool var_3869_validate_indices_0 = const()[name = string("op_3869_validate_indices_0"), val = bool(false)];
+            tensor<fp16, [1, 1]> var_3869_cast_fp16 = gather_along_axis(axis = var_3866, indices = var_3868, validate_indices = var_3869_validate_indices_0, x = logits_cast_fp16)[name = string("op_3869_cast_fp16")];
+            tensor<int32, [1]> var_3870_axes_0 = const()[name = string("op_3870_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<fp16, [1]> token_logit = squeeze(axes = var_3870_axes_0, x = var_3869_cast_fp16)[name = string("op_3870_cast_fp16")];
+            tensor<fp16, [1, 1, 2048, 1]> update_mask_tmp = identity(x = update_mask)[name = string("update_mask_tmp")];
+        } -> (token_id, token_logit, hidden_states_out);
+    func verify_qK<ios18>(tensor<fp16, [1, 1, 3, 2048]> causal_mask_full, tensor<fp16, [1, 1, 3, 512]> causal_mask_sliding, tensor<fp16, [1, 1, 3, 512]> cos_f, tensor<fp16, [1, 1, 3, 256]> cos_s, tensor<fp16, [1, 3, 2560]> hidden_states, tensor<fp16, [1, 2, 512, 256]> kv13_k, tensor<fp16, [1, 2, 512, 256]> kv13_v, tensor<fp16, [1, 2, 2048, 512]> kv14_k, tensor<fp16, [1, 2, 2048, 512]> kv14_v, tensor<fp16, [1, 3, 10752]> per_layer_combined, tensor<fp16, [1, 1, 3, 512]> sin_f, tensor<fp16, [1, 1, 3, 256]> sin_s) {
+            tensor<fp16, [2048, 2560, 1, 1]> layers_0_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2621568))))[name = string("layers_0_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [256]> layers_0_self_attn_q_norm_weight = const()[name = string("layers_0_self_attn_q_norm_weight"), val = tensor<fp16, [256]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2623680)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(2624256))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15731520))))[name = string("layers_0_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_0_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(15741824))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28849088))))[name = string("layers_0_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_0_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(28859392))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41966656))))[name = string("layers_0_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_0_post_feedforward_layernorm_weight = const()[name = string("layers_0_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41969280)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_0_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(41974464))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302208))))[name = string("layers_0_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_1_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(42302528))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44924032))))[name = string("layers_1_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(44926144))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58033408))))[name = string("layers_1_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_1_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58043712))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71150976))))[name = string("layers_1_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_1_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(71161280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84268544))))[name = string("layers_1_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_1_post_feedforward_layernorm_weight = const()[name = string("layers_1_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84271168)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_1_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84276352))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604096))))[name = string("layers_1_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_2_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(84604416))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89847360))))[name = string("layers_2_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [512]> layers_2_self_attn_q_norm_weight = const()[name = string("layers_2_self_attn_q_norm_weight"), val = tensor<fp16, [512]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89851520)))];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(89852608))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102959872))))[name = string("layers_2_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_2_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102970176))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116077440))))[name = string("layers_2_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_2_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116087744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129195008))))[name = string("layers_2_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_2_post_feedforward_layernorm_weight = const()[name = string("layers_2_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129197632)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_2_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129202816))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129530560))))[name = string("layers_2_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_3_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(129530880))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132152384))))[name = string("layers_3_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132154496))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145261760))))[name = string("layers_3_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_3_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(145272064))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158379328))))[name = string("layers_3_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_3_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158389632))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171496896))))[name = string("layers_3_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_3_post_feedforward_layernorm_weight = const()[name = string("layers_3_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171499520)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_3_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171504704))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171832448))))[name = string("layers_3_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_4_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171832768))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174454272))))[name = string("layers_4_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(174456384))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187563648))))[name = string("layers_4_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_4_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187573952))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200681216))))[name = string("layers_4_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_4_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(200691520))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213798784))))[name = string("layers_4_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_4_post_feedforward_layernorm_weight = const()[name = string("layers_4_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213801408)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_4_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(213806592))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214134336))))[name = string("layers_4_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_5_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(214134656))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216756160))))[name = string("layers_5_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(216758272))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229865536))))[name = string("layers_5_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_5_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229875840))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242983104))))[name = string("layers_5_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_5_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(242993408))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256100672))))[name = string("layers_5_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_5_post_feedforward_layernorm_weight = const()[name = string("layers_5_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256103296)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_5_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256108480))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436224))))[name = string("layers_5_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_6_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(256436544))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259058048))))[name = string("layers_6_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(259060160))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272167424))))[name = string("layers_6_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_6_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272177728))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285284992))))[name = string("layers_6_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_6_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285295296))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298402560))))[name = string("layers_6_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_6_post_feedforward_layernorm_weight = const()[name = string("layers_6_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298405184)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_6_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298410368))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738112))))[name = string("layers_6_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [2048, 2560, 1, 1]> layers_7_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2048, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(298738432))), lut = tensor<fp16, [64, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301359936))))[name = string("layers_7_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301362048))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314469312))))[name = string("layers_7_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_7_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314479616))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327586880))))[name = string("layers_7_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_7_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(327597184))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340704448))))[name = string("layers_7_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_7_post_feedforward_layernorm_weight = const()[name = string("layers_7_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340707072)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_7_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(340712256))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040000))))[name = string("layers_7_per_layer_input_gate_weight_palettized")];
+            tensor<fp16, [4096, 2560, 1, 1]> layers_8_self_attn_q_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [4096, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(341040320))), lut = tensor<fp16, [128, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346283264))))[name = string("layers_8_self_attn_q_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_gate_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(346287424))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(359394688))))[name = string("layers_8_mlp_gate_proj_weight_palettized")];
+            tensor<fp16, [10240, 2560, 1, 1]> layers_8_mlp_up_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [10240, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(359404992))), lut = tensor<fp16, [320, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(372512256))))[name = string("layers_8_mlp_up_proj_weight_palettized")];
+            tensor<fp16, [2560, 10240, 1, 1]> layers_8_mlp_down_proj_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 10240, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(372522560))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385629824))))[name = string("layers_8_mlp_down_proj_weight_palettized")];
+            tensor<fp16, [2560]> layers_8_post_feedforward_layernorm_weight = const()[name = string("layers_8_post_feedforward_layernorm_weight"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385632448)))];
+            tensor<fp16, [256, 2560, 1, 1]> layers_8_per_layer_input_gate_weight_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [256, 2560, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385637632))), lut = tensor<fp16, [8, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385965376))))[name = string("layers_8_per_layer_input_gate_weight_palettized")];
+            int32 var_451 = const()[name = string("op_451"), val = int32(-1)];
+            fp16 const_0_promoted_to_fp16 = const()[name = string("const_0_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_453_cast_fp16 = mul(x = hidden_states, y = const_0_promoted_to_fp16)[name = string("op_453_cast_fp16")];
+            bool input_1_interleave_0 = const()[name = string("input_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_1_cast_fp16 = concat(axis = var_451, interleave = input_1_interleave_0, values = (hidden_states, var_453_cast_fp16))[name = string("input_1_cast_fp16")];
+            tensor<int32, [1]> normed_1_axes_0 = const()[name = string("normed_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_448_to_fp16 = const()[name = string("op_448_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_1_cast_fp16 = layer_norm(axes = normed_1_axes_0, epsilon = var_448_to_fp16, x = input_1_cast_fp16)[name = string("normed_1_cast_fp16")];
+            tensor<int32, [2]> var_458_split_sizes_0 = const()[name = string("op_458_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_458_axis_0 = const()[name = string("op_458_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_458_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_458_cast_fp16_1 = split(axis = var_458_axis_0, split_sizes = var_458_split_sizes_0, x = normed_1_cast_fp16)[name = string("op_458_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385965696)))];
+            tensor<fp16, [1, 3, 2560]> h_1_cast_fp16 = mul(x = var_458_cast_fp16_0, y = layers_0_input_layernorm_weight_promoted_to_fp16)[name = string("h_1_cast_fp16")];
+            tensor<int32, [3]> var_464 = const()[name = string("op_464"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_467_axes_0 = const()[name = string("op_467_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_465_cast_fp16 = transpose(perm = var_464, x = h_1_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [1, 2560, 1, 3]> var_467_cast_fp16 = expand_dims(axes = var_467_axes_0, x = var_465_cast_fp16)[name = string("op_467_cast_fp16")];
+            string q_1_pad_type_0 = const()[name = string("q_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_1_strides_0 = const()[name = string("q_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_1_pad_0 = const()[name = string("q_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_1_dilations_0 = const()[name = string("q_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_1_groups_0 = const()[name = string("q_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_1 = conv(dilations = q_1_dilations_0, groups = q_1_groups_0, pad = q_1_pad_0, pad_type = q_1_pad_type_0, strides = q_1_strides_0, weight = layers_0_self_attn_q_proj_weight_palettized, x = var_467_cast_fp16)[name = string("q_1")];
+            tensor<int32, [4]> var_488 = const()[name = string("op_488"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_489 = reshape(shape = var_488, x = q_1)[name = string("op_489")];
+            tensor<int32, [4]> transpose_36_perm_0 = const()[name = string("transpose_36_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_512 = const()[name = string("op_512"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_36 = transpose(perm = transpose_36_perm_0, x = var_489)[name = string("transpose_111")];
+            tensor<fp16, [3, 8, 256]> x_1 = reshape(shape = var_512, x = transpose_36)[name = string("x_1")];
+            int32 var_518 = const()[name = string("op_518"), val = int32(-1)];
+            fp16 const_1_promoted = const()[name = string("const_1_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_520 = mul(x = x_1, y = const_1_promoted)[name = string("op_520")];
+            bool input_5_interleave_0 = const()[name = string("input_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_5 = concat(axis = var_518, interleave = input_5_interleave_0, values = (x_1, var_520))[name = string("input_5")];
+            tensor<int32, [1]> normed_5_axes_0 = const()[name = string("normed_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_515_to_fp16 = const()[name = string("op_515_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_5_cast_fp16 = layer_norm(axes = normed_5_axes_0, epsilon = var_515_to_fp16, x = input_5)[name = string("normed_5_cast_fp16")];
+            tensor<int32, [2]> var_525_split_sizes_0 = const()[name = string("op_525_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_525_axis_0 = const()[name = string("op_525_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_525_0, tensor<fp16, [3, 8, 256]> var_525_1 = split(axis = var_525_axis_0, split_sizes = var_525_split_sizes_0, x = normed_5_cast_fp16)[name = string("op_525")];
+            tensor<fp16, [3, 8, 256]> q_5 = mul(x = var_525_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_5")];
+            tensor<int32, [4]> var_532 = const()[name = string("op_532"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_533 = reshape(shape = var_532, x = q_5)[name = string("op_533")];
+            tensor<int32, [4]> var_538 = const()[name = string("op_538"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_7 = transpose(perm = var_538, x = var_533)[name = string("transpose_110")];
+            tensor<fp16, [1, 8, 3, 256]> var_540_cast_fp16 = mul(x = q_7, y = cos_s)[name = string("op_540_cast_fp16")];
+            tensor<int32, [2]> var_541_split_sizes_0 = const()[name = string("op_541_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_541_axis_0 = const()[name = string("op_541_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_541_0, tensor<fp16, [1, 8, 3, 128]> var_541_1 = split(axis = var_541_axis_0, split_sizes = var_541_split_sizes_0, x = q_7)[name = string("op_541")];
+            fp16 const_2_promoted = const()[name = string("const_2_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_543 = mul(x = var_541_1, y = const_2_promoted)[name = string("op_543")];
+            int32 var_545 = const()[name = string("op_545"), val = int32(-1)];
+            bool var_546_interleave_0 = const()[name = string("op_546_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_546 = concat(axis = var_545, interleave = var_546_interleave_0, values = (var_543, var_541_0))[name = string("op_546")];
+            tensor<fp16, [1, 8, 3, 256]> var_547_cast_fp16 = mul(x = var_546, y = sin_s)[name = string("op_547_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_9_cast_fp16 = add(x = var_540_cast_fp16, y = var_547_cast_fp16)[name = string("q_9_cast_fp16")];
+            tensor<int32, [4]> transpose_0_perm_0 = const()[name = string("transpose_0_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_0_reps_0 = const()[name = string("tile_0_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = kv13_k)[name = string("transpose_109")];
+            tensor<fp16, [8, 1, 512, 256]> tile_0_cast_fp16 = tile(reps = tile_0_reps_0, x = transpose_0_cast_fp16)[name = string("tile_0_cast_fp16")];
+            tensor<int32, [5]> concat_0 = const()[name = string("concat_0"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_0_cast_fp16 = reshape(shape = concat_0, x = tile_0_cast_fp16)[name = string("reshape_0_cast_fp16")];
+            tensor<int32, [5]> transpose_1_perm_0 = const()[name = string("transpose_1_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_1 = const()[name = string("concat_1"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_1_cast_fp16 = transpose(perm = transpose_1_perm_0, x = reshape_0_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_1_cast_fp16 = reshape(shape = concat_1, x = transpose_1_cast_fp16)[name = string("reshape_1_cast_fp16")];
+            tensor<int32, [4]> transpose_37_perm_0 = const()[name = string("transpose_37_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_2_perm_0 = const()[name = string("transpose_2_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_1_reps_0 = const()[name = string("tile_1_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 512, 256]> transpose_2_cast_fp16 = transpose(perm = transpose_2_perm_0, x = kv13_v)[name = string("transpose_107")];
+            tensor<fp16, [8, 1, 512, 256]> tile_1_cast_fp16 = tile(reps = tile_1_reps_0, x = transpose_2_cast_fp16)[name = string("tile_1_cast_fp16")];
+            tensor<int32, [5]> concat_2 = const()[name = string("concat_2"), val = tensor<int32, [5]>([4, 2, 1, 512, 256])];
+            tensor<fp16, [4, 2, 1, 512, 256]> reshape_2_cast_fp16 = reshape(shape = concat_2, x = tile_1_cast_fp16)[name = string("reshape_2_cast_fp16")];
+            tensor<int32, [5]> transpose_3_perm_0 = const()[name = string("transpose_3_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_3 = const()[name = string("concat_3"), val = tensor<int32, [4]>([-1, 1, 512, 256])];
+            tensor<fp16, [2, 4, 1, 512, 256]> transpose_3_cast_fp16 = transpose(perm = transpose_3_perm_0, x = reshape_2_cast_fp16)[name = string("transpose_106")];
+            tensor<fp16, [8, 1, 512, 256]> reshape_3_cast_fp16 = reshape(shape = concat_3, x = transpose_3_cast_fp16)[name = string("reshape_3_cast_fp16")];
+            tensor<int32, [4]> V_expanded_1_perm_0 = const()[name = string("V_expanded_1_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_1_transpose_x_0 = const()[name = string("attn_weights_1_transpose_x_0"), val = bool(false)];
+            bool attn_weights_1_transpose_y_0 = const()[name = string("attn_weights_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 256, 512]> transpose_37_cast_fp16 = transpose(perm = transpose_37_perm_0, x = reshape_1_cast_fp16)[name = string("transpose_105")];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_1_cast_fp16 = matmul(transpose_x = attn_weights_1_transpose_x_0, transpose_y = attn_weights_1_transpose_y_0, x = q_9_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_1_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_3_cast_fp16 = add(x = attn_weights_1_cast_fp16, y = causal_mask_sliding)[name = string("x_3_cast_fp16")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = x_3_cast_fp16)[name = string("reduce_max_0")];
+            tensor<fp16, [1, 8, 3, 512]> var_579 = sub(x = x_3_cast_fp16, y = reduce_max_0)[name = string("op_579")];
+            tensor<fp16, [1, 8, 3, 512]> var_585 = exp(x = var_579)[name = string("op_585")];
+            tensor<int32, [1]> var_595_axes_0 = const()[name = string("op_595_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_595_keep_dims_0 = const()[name = string("op_595_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_595 = reduce_sum(axes = var_595_axes_0, keep_dims = var_595_keep_dims_0, x = var_585)[name = string("op_595")];
+            tensor<fp16, [1, 8, 3, 512]> var_601_cast_fp16 = real_div(x = var_585, y = var_595)[name = string("op_601_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 256]> V_expanded_1_cast_fp16 = transpose(perm = V_expanded_1_perm_0, x = reshape_3_cast_fp16)[name = string("transpose_104")];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = var_601_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_612 = const()[name = string("op_612"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_619 = const()[name = string("op_619"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_613_cast_fp16 = transpose(perm = var_612, x = attn_output_1_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [1, 3, 2048]> attn_output_3_cast_fp16 = reshape(shape = var_619, x = var_613_cast_fp16)[name = string("attn_output_3_cast_fp16")];
+            tensor<int32, [3]> var_624 = const()[name = string("op_624"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_640_pad_type_0 = const()[name = string("op_640_pad_type_0"), val = string("valid")];
+            int32 var_640_groups_0 = const()[name = string("op_640_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_640_strides_0 = const()[name = string("op_640_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_640_pad_0 = const()[name = string("op_640_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_640_dilations_0 = const()[name = string("op_640_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_0_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(385970880))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388592384))))[name = string("squeeze_0_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_625_cast_fp16 = transpose(perm = var_624, x = attn_output_3_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [1, 2560, 3]> var_640_cast_fp16 = conv(dilations = var_640_dilations_0, groups = var_640_groups_0, pad = var_640_pad_0, pad_type = var_640_pad_type_0, strides = var_640_strides_0, weight = squeeze_0_cast_fp16_to_fp32_to_fp16_palettized, x = var_625_cast_fp16)[name = string("op_640_cast_fp16")];
+            tensor<int32, [3]> var_644 = const()[name = string("op_644"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_650 = const()[name = string("op_650"), val = int32(-1)];
+            fp16 const_3_promoted_to_fp16 = const()[name = string("const_3_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_7_cast_fp16 = transpose(perm = var_644, x = var_640_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [1, 3, 2560]> var_652_cast_fp16 = mul(x = x_7_cast_fp16, y = const_3_promoted_to_fp16)[name = string("op_652_cast_fp16")];
+            bool input_9_interleave_0 = const()[name = string("input_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_9_cast_fp16 = concat(axis = var_650, interleave = input_9_interleave_0, values = (x_7_cast_fp16, var_652_cast_fp16))[name = string("input_9_cast_fp16")];
+            tensor<int32, [1]> normed_9_axes_0 = const()[name = string("normed_9_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_647_to_fp16 = const()[name = string("op_647_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_9_cast_fp16 = layer_norm(axes = normed_9_axes_0, epsilon = var_647_to_fp16, x = input_9_cast_fp16)[name = string("normed_9_cast_fp16")];
+            tensor<int32, [2]> var_657_split_sizes_0 = const()[name = string("op_657_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_657_axis_0 = const()[name = string("op_657_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_657_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_657_cast_fp16_1 = split(axis = var_657_axis_0, split_sizes = var_657_split_sizes_0, x = normed_9_cast_fp16)[name = string("op_657_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388595008)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_5_cast_fp16 = mul(x = var_657_cast_fp16_0, y = layers_0_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_9_cast_fp16 = add(x = hidden_states, y = attn_output_5_cast_fp16)[name = string("x_9_cast_fp16")];
+            int32 var_666 = const()[name = string("op_666"), val = int32(-1)];
+            fp16 const_4_promoted_to_fp16 = const()[name = string("const_4_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_668_cast_fp16 = mul(x = x_9_cast_fp16, y = const_4_promoted_to_fp16)[name = string("op_668_cast_fp16")];
+            bool input_11_interleave_0 = const()[name = string("input_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_11_cast_fp16 = concat(axis = var_666, interleave = input_11_interleave_0, values = (x_9_cast_fp16, var_668_cast_fp16))[name = string("input_11_cast_fp16")];
+            tensor<int32, [1]> normed_13_axes_0 = const()[name = string("normed_13_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_663_to_fp16 = const()[name = string("op_663_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_13_cast_fp16 = layer_norm(axes = normed_13_axes_0, epsilon = var_663_to_fp16, x = input_11_cast_fp16)[name = string("normed_13_cast_fp16")];
+            tensor<int32, [2]> var_673_split_sizes_0 = const()[name = string("op_673_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_673_axis_0 = const()[name = string("op_673_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_673_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_673_cast_fp16_1 = split(axis = var_673_axis_0, split_sizes = var_673_split_sizes_0, x = normed_13_cast_fp16)[name = string("op_673_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388600192)))];
+            tensor<fp16, [1, 3, 2560]> h_3_cast_fp16 = mul(x = var_673_cast_fp16_0, y = layers_0_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_3_cast_fp16")];
+            tensor<int32, [3]> var_684 = const()[name = string("op_684"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_13_axes_0 = const()[name = string("input_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_685 = transpose(perm = var_684, x = h_3_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [1, 2560, 1, 3]> input_13 = expand_dims(axes = input_13_axes_0, x = var_685)[name = string("input_13")];
+            string gate_1_pad_type_0 = const()[name = string("gate_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_1_strides_0 = const()[name = string("gate_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_1_pad_0 = const()[name = string("gate_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_1_dilations_0 = const()[name = string("gate_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_1_groups_0 = const()[name = string("gate_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_1 = conv(dilations = gate_1_dilations_0, groups = gate_1_groups_0, pad = gate_1_pad_0, pad_type = gate_1_pad_type_0, strides = gate_1_strides_0, weight = layers_0_mlp_gate_proj_weight_palettized, x = input_13)[name = string("gate_1")];
+            string up_1_pad_type_0 = const()[name = string("up_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_1_strides_0 = const()[name = string("up_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_1_pad_0 = const()[name = string("up_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_1_dilations_0 = const()[name = string("up_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_1_groups_0 = const()[name = string("up_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_1 = conv(dilations = up_1_dilations_0, groups = up_1_groups_0, pad = up_1_pad_0, pad_type = up_1_pad_type_0, strides = up_1_strides_0, weight = layers_0_mlp_up_proj_weight_palettized, x = input_13)[name = string("up_1")];
+            string gate_3_mode_0 = const()[name = string("gate_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_3 = gelu(mode = gate_3_mode_0, x = gate_1)[name = string("gate_3")];
+            tensor<fp16, [1, 10240, 1, 3]> input_15 = mul(x = gate_3, y = up_1)[name = string("input_15")];
+            string mlp_out_1_pad_type_0 = const()[name = string("mlp_out_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_1_strides_0 = const()[name = string("mlp_out_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_1_pad_0 = const()[name = string("mlp_out_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_1_dilations_0 = const()[name = string("mlp_out_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_1_groups_0 = const()[name = string("mlp_out_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_1 = conv(dilations = mlp_out_1_dilations_0, groups = mlp_out_1_groups_0, pad = mlp_out_1_pad_0, pad_type = mlp_out_1_pad_type_0, strides = mlp_out_1_strides_0, weight = layers_0_mlp_down_proj_weight_palettized, x = input_15)[name = string("mlp_out_1")];
+            tensor<int32, [1]> var_725_axes_0 = const()[name = string("op_725_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_725 = squeeze(axes = var_725_axes_0, x = mlp_out_1)[name = string("op_725")];
+            tensor<int32, [3]> var_729 = const()[name = string("op_729"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_735 = const()[name = string("op_735"), val = int32(-1)];
+            fp16 const_5_promoted = const()[name = string("const_5_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_11 = transpose(perm = var_729, x = var_725)[name = string("transpose_99")];
+            tensor<fp16, [1, 3, 2560]> var_737 = mul(x = x_11, y = const_5_promoted)[name = string("op_737")];
+            bool input_17_interleave_0 = const()[name = string("input_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_17 = concat(axis = var_735, interleave = input_17_interleave_0, values = (x_11, var_737))[name = string("input_17")];
+            tensor<int32, [1]> normed_17_axes_0 = const()[name = string("normed_17_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_732_to_fp16 = const()[name = string("op_732_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_17_cast_fp16 = layer_norm(axes = normed_17_axes_0, epsilon = var_732_to_fp16, x = input_17)[name = string("normed_17_cast_fp16")];
+            tensor<int32, [2]> var_742_split_sizes_0 = const()[name = string("op_742_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_742_axis_0 = const()[name = string("op_742_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_742_0, tensor<fp16, [1, 3, 2560]> var_742_1 = split(axis = var_742_axis_0, split_sizes = var_742_split_sizes_0, x = normed_17_cast_fp16)[name = string("op_742")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_3 = mul(x = var_742_0, y = layers_0_post_feedforward_layernorm_weight)[name = string("hidden_states_3")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_5_cast_fp16 = add(x = x_9_cast_fp16, y = hidden_states_3)[name = string("hidden_states_5_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_1_begin_0 = const()[name = string("per_layer_slice_1_begin_0"), val = tensor<int32, [3]>([0, 0, 8448])];
+            tensor<int32, [3]> per_layer_slice_1_end_0 = const()[name = string("per_layer_slice_1_end_0"), val = tensor<int32, [3]>([1, 3, 8704])];
+            tensor<bool, [3]> per_layer_slice_1_end_mask_0 = const()[name = string("per_layer_slice_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_1_cast_fp16 = slice_by_index(begin = per_layer_slice_1_begin_0, end = per_layer_slice_1_end_0, end_mask = per_layer_slice_1_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_1_cast_fp16")];
+            tensor<int32, [3]> var_770 = const()[name = string("op_770"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_19_axes_0 = const()[name = string("input_19_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_771 = transpose(perm = var_770, x = hidden_states_5_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [1, 2560, 1, 3]> input_19 = expand_dims(axes = input_19_axes_0, x = var_771)[name = string("input_19")];
+            string gated_1_pad_type_0 = const()[name = string("gated_1_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_1_strides_0 = const()[name = string("gated_1_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_1_pad_0 = const()[name = string("gated_1_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_1_dilations_0 = const()[name = string("gated_1_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_1_groups_0 = const()[name = string("gated_1_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_1 = conv(dilations = gated_1_dilations_0, groups = gated_1_groups_0, pad = gated_1_pad_0, pad_type = gated_1_pad_type_0, strides = gated_1_strides_0, weight = layers_0_per_layer_input_gate_weight_palettized, x = input_19)[name = string("gated_1")];
+            string gated_3_mode_0 = const()[name = string("gated_3_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_3 = gelu(mode = gated_3_mode_0, x = gated_1)[name = string("gated_3")];
+            tensor<int32, [3]> var_790 = const()[name = string("op_790"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_1_axes_0 = const()[name = string("per_layer_slice_conv_1_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_791_cast_fp16 = transpose(perm = var_790, x = per_layer_slice_1_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_1_cast_fp16 = expand_dims(axes = per_layer_slice_conv_1_axes_0, x = var_791_cast_fp16)[name = string("per_layer_slice_conv_1_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_21_cast_fp16 = mul(x = gated_3, y = per_layer_slice_conv_1_cast_fp16)[name = string("input_21_cast_fp16")];
+            string gated_5_pad_type_0 = const()[name = string("gated_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_5_strides_0 = const()[name = string("gated_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_5_pad_0 = const()[name = string("gated_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_5_dilations_0 = const()[name = string("gated_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_5_groups_0 = const()[name = string("gated_5_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_0_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388605376))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388933120))))[name = string("layers_0_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_5_cast_fp16 = conv(dilations = gated_5_dilations_0, groups = gated_5_groups_0, pad = gated_5_pad_0, pad_type = gated_5_pad_type_0, strides = gated_5_strides_0, weight = layers_0_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_21_cast_fp16)[name = string("gated_5_cast_fp16")];
+            tensor<int32, [1]> var_807_axes_0 = const()[name = string("op_807_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_807_cast_fp16 = squeeze(axes = var_807_axes_0, x = gated_5_cast_fp16)[name = string("op_807_cast_fp16")];
+            tensor<int32, [3]> var_811 = const()[name = string("op_811"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_817 = const()[name = string("op_817"), val = int32(-1)];
+            fp16 const_6_promoted_to_fp16 = const()[name = string("const_6_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_13_cast_fp16 = transpose(perm = var_811, x = var_807_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 3, 2560]> var_819_cast_fp16 = mul(x = x_13_cast_fp16, y = const_6_promoted_to_fp16)[name = string("op_819_cast_fp16")];
+            bool input_23_interleave_0 = const()[name = string("input_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_23_cast_fp16 = concat(axis = var_817, interleave = input_23_interleave_0, values = (x_13_cast_fp16, var_819_cast_fp16))[name = string("input_23_cast_fp16")];
+            tensor<int32, [1]> normed_21_axes_0 = const()[name = string("normed_21_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_814_to_fp16 = const()[name = string("op_814_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_21_cast_fp16 = layer_norm(axes = normed_21_axes_0, epsilon = var_814_to_fp16, x = input_23_cast_fp16)[name = string("normed_21_cast_fp16")];
+            tensor<int32, [2]> var_824_split_sizes_0 = const()[name = string("op_824_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_824_axis_0 = const()[name = string("op_824_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_824_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_824_cast_fp16_1 = split(axis = var_824_axis_0, split_sizes = var_824_split_sizes_0, x = normed_21_cast_fp16)[name = string("op_824_cast_fp16")];
+            tensor<fp16, [2560]> layers_0_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_0_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388935744)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_9_cast_fp16 = mul(x = var_824_cast_fp16_0, y = layers_0_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_9_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_11_cast_fp16 = add(x = hidden_states_5_cast_fp16, y = hidden_states_9_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            tensor<fp16, [1]> const_7_promoted_to_fp16 = const()[name = string("const_7_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.a6p-1])];
+            tensor<fp16, [1, 3, 2560]> x_15_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = const_7_promoted_to_fp16)[name = string("x_15_cast_fp16")];
+            int32 var_839 = const()[name = string("op_839"), val = int32(-1)];
+            fp16 const_8_promoted_to_fp16 = const()[name = string("const_8_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_841_cast_fp16 = mul(x = x_15_cast_fp16, y = const_8_promoted_to_fp16)[name = string("op_841_cast_fp16")];
+            bool input_25_interleave_0 = const()[name = string("input_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_25_cast_fp16 = concat(axis = var_839, interleave = input_25_interleave_0, values = (x_15_cast_fp16, var_841_cast_fp16))[name = string("input_25_cast_fp16")];
+            tensor<int32, [1]> normed_25_axes_0 = const()[name = string("normed_25_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_836_to_fp16 = const()[name = string("op_836_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_25_cast_fp16 = layer_norm(axes = normed_25_axes_0, epsilon = var_836_to_fp16, x = input_25_cast_fp16)[name = string("normed_25_cast_fp16")];
+            tensor<int32, [2]> var_846_split_sizes_0 = const()[name = string("op_846_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_846_axis_0 = const()[name = string("op_846_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_846_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_846_cast_fp16_1 = split(axis = var_846_axis_0, split_sizes = var_846_split_sizes_0, x = normed_25_cast_fp16)[name = string("op_846_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388940928)))];
+            tensor<fp16, [1, 3, 2560]> h_7_cast_fp16 = mul(x = var_846_cast_fp16_0, y = layers_1_input_layernorm_weight_promoted_to_fp16)[name = string("h_7_cast_fp16")];
+            tensor<int32, [3]> var_852 = const()[name = string("op_852"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_855_axes_0 = const()[name = string("op_855_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_853_cast_fp16 = transpose(perm = var_852, x = h_7_cast_fp16)[name = string("transpose_95")];
+            tensor<fp16, [1, 2560, 1, 3]> var_855_cast_fp16 = expand_dims(axes = var_855_axes_0, x = var_853_cast_fp16)[name = string("op_855_cast_fp16")];
+            string q_11_pad_type_0 = const()[name = string("q_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_11_strides_0 = const()[name = string("q_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_11_pad_0 = const()[name = string("q_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_11_dilations_0 = const()[name = string("q_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_11_groups_0 = const()[name = string("q_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_11 = conv(dilations = q_11_dilations_0, groups = q_11_groups_0, pad = q_11_pad_0, pad_type = q_11_pad_type_0, strides = q_11_strides_0, weight = layers_1_self_attn_q_proj_weight_palettized, x = var_855_cast_fp16)[name = string("q_11")];
+            tensor<int32, [4]> var_876 = const()[name = string("op_876"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_877 = reshape(shape = var_876, x = q_11)[name = string("op_877")];
+            tensor<int32, [4]> transpose_38_perm_0 = const()[name = string("transpose_38_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_900 = const()[name = string("op_900"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_38 = transpose(perm = transpose_38_perm_0, x = var_877)[name = string("transpose_94")];
+            tensor<fp16, [3, 8, 256]> x_17 = reshape(shape = var_900, x = transpose_38)[name = string("x_17")];
+            int32 var_906 = const()[name = string("op_906"), val = int32(-1)];
+            fp16 const_9_promoted = const()[name = string("const_9_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_908 = mul(x = x_17, y = const_9_promoted)[name = string("op_908")];
+            bool input_29_interleave_0 = const()[name = string("input_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_29 = concat(axis = var_906, interleave = input_29_interleave_0, values = (x_17, var_908))[name = string("input_29")];
+            tensor<int32, [1]> normed_29_axes_0 = const()[name = string("normed_29_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_903_to_fp16 = const()[name = string("op_903_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_29_cast_fp16 = layer_norm(axes = normed_29_axes_0, epsilon = var_903_to_fp16, x = input_29)[name = string("normed_29_cast_fp16")];
+            tensor<int32, [2]> var_913_split_sizes_0 = const()[name = string("op_913_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_913_axis_0 = const()[name = string("op_913_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_913_0, tensor<fp16, [3, 8, 256]> var_913_1 = split(axis = var_913_axis_0, split_sizes = var_913_split_sizes_0, x = normed_29_cast_fp16)[name = string("op_913")];
+            tensor<fp16, [3, 8, 256]> q_15 = mul(x = var_913_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_15")];
+            tensor<int32, [4]> var_920 = const()[name = string("op_920"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_921 = reshape(shape = var_920, x = q_15)[name = string("op_921")];
+            tensor<int32, [4]> var_926 = const()[name = string("op_926"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_17 = transpose(perm = var_926, x = var_921)[name = string("transpose_93")];
+            tensor<fp16, [1, 8, 3, 256]> var_928_cast_fp16 = mul(x = q_17, y = cos_s)[name = string("op_928_cast_fp16")];
+            tensor<int32, [2]> var_929_split_sizes_0 = const()[name = string("op_929_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_929_axis_0 = const()[name = string("op_929_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_929_0, tensor<fp16, [1, 8, 3, 128]> var_929_1 = split(axis = var_929_axis_0, split_sizes = var_929_split_sizes_0, x = q_17)[name = string("op_929")];
+            fp16 const_10_promoted = const()[name = string("const_10_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_931 = mul(x = var_929_1, y = const_10_promoted)[name = string("op_931")];
+            int32 var_933 = const()[name = string("op_933"), val = int32(-1)];
+            bool var_934_interleave_0 = const()[name = string("op_934_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_934 = concat(axis = var_933, interleave = var_934_interleave_0, values = (var_931, var_929_0))[name = string("op_934")];
+            tensor<fp16, [1, 8, 3, 256]> var_935_cast_fp16 = mul(x = var_934, y = sin_s)[name = string("op_935_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_19_cast_fp16 = add(x = var_928_cast_fp16, y = var_935_cast_fp16)[name = string("q_19_cast_fp16")];
+            bool attn_weights_5_transpose_x_0 = const()[name = string("attn_weights_5_transpose_x_0"), val = bool(false)];
+            bool attn_weights_5_transpose_y_0 = const()[name = string("attn_weights_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_5_cast_fp16 = matmul(transpose_x = attn_weights_5_transpose_x_0, transpose_y = attn_weights_5_transpose_y_0, x = q_19_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_5_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_19_cast_fp16 = add(x = attn_weights_5_cast_fp16, y = causal_mask_sliding)[name = string("x_19_cast_fp16")];
+            tensor<int32, [1]> reduce_max_1_axes_0 = const()[name = string("reduce_max_1_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_1_keep_dims_0 = const()[name = string("reduce_max_1_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_1 = reduce_max(axes = reduce_max_1_axes_0, keep_dims = reduce_max_1_keep_dims_0, x = x_19_cast_fp16)[name = string("reduce_max_1")];
+            tensor<fp16, [1, 8, 3, 512]> var_967 = sub(x = x_19_cast_fp16, y = reduce_max_1)[name = string("op_967")];
+            tensor<fp16, [1, 8, 3, 512]> var_973 = exp(x = var_967)[name = string("op_973")];
+            tensor<int32, [1]> var_983_axes_0 = const()[name = string("op_983_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_983_keep_dims_0 = const()[name = string("op_983_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_983 = reduce_sum(axes = var_983_axes_0, keep_dims = var_983_keep_dims_0, x = var_973)[name = string("op_983")];
+            tensor<fp16, [1, 8, 3, 512]> var_989_cast_fp16 = real_div(x = var_973, y = var_983)[name = string("op_989_cast_fp16")];
+            bool attn_output_7_transpose_x_0 = const()[name = string("attn_output_7_transpose_x_0"), val = bool(false)];
+            bool attn_output_7_transpose_y_0 = const()[name = string("attn_output_7_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_7_cast_fp16 = matmul(transpose_x = attn_output_7_transpose_x_0, transpose_y = attn_output_7_transpose_y_0, x = var_989_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_7_cast_fp16")];
+            tensor<int32, [4]> var_1000 = const()[name = string("op_1000"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1007 = const()[name = string("op_1007"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1001_cast_fp16 = transpose(perm = var_1000, x = attn_output_7_cast_fp16)[name = string("transpose_92")];
+            tensor<fp16, [1, 3, 2048]> attn_output_9_cast_fp16 = reshape(shape = var_1007, x = var_1001_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [3]> var_1012 = const()[name = string("op_1012"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1028_pad_type_0 = const()[name = string("op_1028_pad_type_0"), val = string("valid")];
+            int32 var_1028_groups_0 = const()[name = string("op_1028_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1028_strides_0 = const()[name = string("op_1028_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1028_pad_0 = const()[name = string("op_1028_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1028_dilations_0 = const()[name = string("op_1028_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_1_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(388946112))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391567616))))[name = string("squeeze_1_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1013_cast_fp16 = transpose(perm = var_1012, x = attn_output_9_cast_fp16)[name = string("transpose_91")];
+            tensor<fp16, [1, 2560, 3]> var_1028_cast_fp16 = conv(dilations = var_1028_dilations_0, groups = var_1028_groups_0, pad = var_1028_pad_0, pad_type = var_1028_pad_type_0, strides = var_1028_strides_0, weight = squeeze_1_cast_fp16_to_fp32_to_fp16_palettized, x = var_1013_cast_fp16)[name = string("op_1028_cast_fp16")];
+            tensor<int32, [3]> var_1032 = const()[name = string("op_1032"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1038 = const()[name = string("op_1038"), val = int32(-1)];
+            fp16 const_11_promoted_to_fp16 = const()[name = string("const_11_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_23_cast_fp16 = transpose(perm = var_1032, x = var_1028_cast_fp16)[name = string("transpose_90")];
+            tensor<fp16, [1, 3, 2560]> var_1040_cast_fp16 = mul(x = x_23_cast_fp16, y = const_11_promoted_to_fp16)[name = string("op_1040_cast_fp16")];
+            bool input_33_interleave_0 = const()[name = string("input_33_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_33_cast_fp16 = concat(axis = var_1038, interleave = input_33_interleave_0, values = (x_23_cast_fp16, var_1040_cast_fp16))[name = string("input_33_cast_fp16")];
+            tensor<int32, [1]> normed_33_axes_0 = const()[name = string("normed_33_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1035_to_fp16 = const()[name = string("op_1035_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_33_cast_fp16 = layer_norm(axes = normed_33_axes_0, epsilon = var_1035_to_fp16, x = input_33_cast_fp16)[name = string("normed_33_cast_fp16")];
+            tensor<int32, [2]> var_1045_split_sizes_0 = const()[name = string("op_1045_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1045_axis_0 = const()[name = string("op_1045_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1045_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1045_cast_fp16_1 = split(axis = var_1045_axis_0, split_sizes = var_1045_split_sizes_0, x = normed_33_cast_fp16)[name = string("op_1045_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391570240)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_11_cast_fp16 = mul(x = var_1045_cast_fp16_0, y = layers_1_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_11_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_25_cast_fp16 = add(x = x_15_cast_fp16, y = attn_output_11_cast_fp16)[name = string("x_25_cast_fp16")];
+            int32 var_1054 = const()[name = string("op_1054"), val = int32(-1)];
+            fp16 const_12_promoted_to_fp16 = const()[name = string("const_12_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1056_cast_fp16 = mul(x = x_25_cast_fp16, y = const_12_promoted_to_fp16)[name = string("op_1056_cast_fp16")];
+            bool input_35_interleave_0 = const()[name = string("input_35_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_35_cast_fp16 = concat(axis = var_1054, interleave = input_35_interleave_0, values = (x_25_cast_fp16, var_1056_cast_fp16))[name = string("input_35_cast_fp16")];
+            tensor<int32, [1]> normed_37_axes_0 = const()[name = string("normed_37_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1051_to_fp16 = const()[name = string("op_1051_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_37_cast_fp16 = layer_norm(axes = normed_37_axes_0, epsilon = var_1051_to_fp16, x = input_35_cast_fp16)[name = string("normed_37_cast_fp16")];
+            tensor<int32, [2]> var_1061_split_sizes_0 = const()[name = string("op_1061_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1061_axis_0 = const()[name = string("op_1061_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1061_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1061_cast_fp16_1 = split(axis = var_1061_axis_0, split_sizes = var_1061_split_sizes_0, x = normed_37_cast_fp16)[name = string("op_1061_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391575424)))];
+            tensor<fp16, [1, 3, 2560]> h_9_cast_fp16 = mul(x = var_1061_cast_fp16_0, y = layers_1_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_9_cast_fp16")];
+            tensor<int32, [3]> var_1072 = const()[name = string("op_1072"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_37_axes_0 = const()[name = string("input_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1073 = transpose(perm = var_1072, x = h_9_cast_fp16)[name = string("transpose_89")];
+            tensor<fp16, [1, 2560, 1, 3]> input_37 = expand_dims(axes = input_37_axes_0, x = var_1073)[name = string("input_37")];
+            string gate_5_pad_type_0 = const()[name = string("gate_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_5_strides_0 = const()[name = string("gate_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_5_pad_0 = const()[name = string("gate_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_5_dilations_0 = const()[name = string("gate_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_5_groups_0 = const()[name = string("gate_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_5 = conv(dilations = gate_5_dilations_0, groups = gate_5_groups_0, pad = gate_5_pad_0, pad_type = gate_5_pad_type_0, strides = gate_5_strides_0, weight = layers_1_mlp_gate_proj_weight_palettized, x = input_37)[name = string("gate_5")];
+            string up_3_pad_type_0 = const()[name = string("up_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_3_strides_0 = const()[name = string("up_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_3_pad_0 = const()[name = string("up_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_3_dilations_0 = const()[name = string("up_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_3_groups_0 = const()[name = string("up_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_3 = conv(dilations = up_3_dilations_0, groups = up_3_groups_0, pad = up_3_pad_0, pad_type = up_3_pad_type_0, strides = up_3_strides_0, weight = layers_1_mlp_up_proj_weight_palettized, x = input_37)[name = string("up_3")];
+            string gate_7_mode_0 = const()[name = string("gate_7_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_7 = gelu(mode = gate_7_mode_0, x = gate_5)[name = string("gate_7")];
+            tensor<fp16, [1, 10240, 1, 3]> input_39 = mul(x = gate_7, y = up_3)[name = string("input_39")];
+            string mlp_out_3_pad_type_0 = const()[name = string("mlp_out_3_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_3_strides_0 = const()[name = string("mlp_out_3_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_3_pad_0 = const()[name = string("mlp_out_3_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_3_dilations_0 = const()[name = string("mlp_out_3_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_3_groups_0 = const()[name = string("mlp_out_3_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_3 = conv(dilations = mlp_out_3_dilations_0, groups = mlp_out_3_groups_0, pad = mlp_out_3_pad_0, pad_type = mlp_out_3_pad_type_0, strides = mlp_out_3_strides_0, weight = layers_1_mlp_down_proj_weight_palettized, x = input_39)[name = string("mlp_out_3")];
+            tensor<int32, [1]> var_1113_axes_0 = const()[name = string("op_1113_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1113 = squeeze(axes = var_1113_axes_0, x = mlp_out_3)[name = string("op_1113")];
+            tensor<int32, [3]> var_1117 = const()[name = string("op_1117"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1123 = const()[name = string("op_1123"), val = int32(-1)];
+            fp16 const_13_promoted = const()[name = string("const_13_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_27 = transpose(perm = var_1117, x = var_1113)[name = string("transpose_88")];
+            tensor<fp16, [1, 3, 2560]> var_1125 = mul(x = x_27, y = const_13_promoted)[name = string("op_1125")];
+            bool input_41_interleave_0 = const()[name = string("input_41_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_41 = concat(axis = var_1123, interleave = input_41_interleave_0, values = (x_27, var_1125))[name = string("input_41")];
+            tensor<int32, [1]> normed_41_axes_0 = const()[name = string("normed_41_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1120_to_fp16 = const()[name = string("op_1120_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_41_cast_fp16 = layer_norm(axes = normed_41_axes_0, epsilon = var_1120_to_fp16, x = input_41)[name = string("normed_41_cast_fp16")];
+            tensor<int32, [2]> var_1130_split_sizes_0 = const()[name = string("op_1130_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1130_axis_0 = const()[name = string("op_1130_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1130_0, tensor<fp16, [1, 3, 2560]> var_1130_1 = split(axis = var_1130_axis_0, split_sizes = var_1130_split_sizes_0, x = normed_41_cast_fp16)[name = string("op_1130")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_13 = mul(x = var_1130_0, y = layers_1_post_feedforward_layernorm_weight)[name = string("hidden_states_13")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_15_cast_fp16 = add(x = x_25_cast_fp16, y = hidden_states_13)[name = string("hidden_states_15_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_3_begin_0 = const()[name = string("per_layer_slice_3_begin_0"), val = tensor<int32, [3]>([0, 0, 8704])];
+            tensor<int32, [3]> per_layer_slice_3_end_0 = const()[name = string("per_layer_slice_3_end_0"), val = tensor<int32, [3]>([1, 3, 8960])];
+            tensor<bool, [3]> per_layer_slice_3_end_mask_0 = const()[name = string("per_layer_slice_3_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_3_cast_fp16 = slice_by_index(begin = per_layer_slice_3_begin_0, end = per_layer_slice_3_end_0, end_mask = per_layer_slice_3_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_3_cast_fp16")];
+            tensor<int32, [3]> var_1158 = const()[name = string("op_1158"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_43_axes_0 = const()[name = string("input_43_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1159 = transpose(perm = var_1158, x = hidden_states_15_cast_fp16)[name = string("transpose_87")];
+            tensor<fp16, [1, 2560, 1, 3]> input_43 = expand_dims(axes = input_43_axes_0, x = var_1159)[name = string("input_43")];
+            string gated_7_pad_type_0 = const()[name = string("gated_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_7_strides_0 = const()[name = string("gated_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_7_pad_0 = const()[name = string("gated_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_7_dilations_0 = const()[name = string("gated_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_7_groups_0 = const()[name = string("gated_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_7 = conv(dilations = gated_7_dilations_0, groups = gated_7_groups_0, pad = gated_7_pad_0, pad_type = gated_7_pad_type_0, strides = gated_7_strides_0, weight = layers_1_per_layer_input_gate_weight_palettized, x = input_43)[name = string("gated_7")];
+            string gated_9_mode_0 = const()[name = string("gated_9_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_9 = gelu(mode = gated_9_mode_0, x = gated_7)[name = string("gated_9")];
+            tensor<int32, [3]> var_1178 = const()[name = string("op_1178"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_3_axes_0 = const()[name = string("per_layer_slice_conv_3_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1179_cast_fp16 = transpose(perm = var_1178, x = per_layer_slice_3_cast_fp16)[name = string("transpose_86")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_3_cast_fp16 = expand_dims(axes = per_layer_slice_conv_3_axes_0, x = var_1179_cast_fp16)[name = string("per_layer_slice_conv_3_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_45_cast_fp16 = mul(x = gated_9, y = per_layer_slice_conv_3_cast_fp16)[name = string("input_45_cast_fp16")];
+            string gated_11_pad_type_0 = const()[name = string("gated_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_11_strides_0 = const()[name = string("gated_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_11_pad_0 = const()[name = string("gated_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_11_dilations_0 = const()[name = string("gated_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_11_groups_0 = const()[name = string("gated_11_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_1_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391580608))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391908352))))[name = string("layers_1_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_11_cast_fp16 = conv(dilations = gated_11_dilations_0, groups = gated_11_groups_0, pad = gated_11_pad_0, pad_type = gated_11_pad_type_0, strides = gated_11_strides_0, weight = layers_1_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_45_cast_fp16)[name = string("gated_11_cast_fp16")];
+            tensor<int32, [1]> var_1195_axes_0 = const()[name = string("op_1195_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1195_cast_fp16 = squeeze(axes = var_1195_axes_0, x = gated_11_cast_fp16)[name = string("op_1195_cast_fp16")];
+            tensor<int32, [3]> var_1199 = const()[name = string("op_1199"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1205 = const()[name = string("op_1205"), val = int32(-1)];
+            fp16 const_14_promoted_to_fp16 = const()[name = string("const_14_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_29_cast_fp16 = transpose(perm = var_1199, x = var_1195_cast_fp16)[name = string("transpose_85")];
+            tensor<fp16, [1, 3, 2560]> var_1207_cast_fp16 = mul(x = x_29_cast_fp16, y = const_14_promoted_to_fp16)[name = string("op_1207_cast_fp16")];
+            bool input_47_interleave_0 = const()[name = string("input_47_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_47_cast_fp16 = concat(axis = var_1205, interleave = input_47_interleave_0, values = (x_29_cast_fp16, var_1207_cast_fp16))[name = string("input_47_cast_fp16")];
+            tensor<int32, [1]> normed_45_axes_0 = const()[name = string("normed_45_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1202_to_fp16 = const()[name = string("op_1202_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_45_cast_fp16 = layer_norm(axes = normed_45_axes_0, epsilon = var_1202_to_fp16, x = input_47_cast_fp16)[name = string("normed_45_cast_fp16")];
+            tensor<int32, [2]> var_1212_split_sizes_0 = const()[name = string("op_1212_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1212_axis_0 = const()[name = string("op_1212_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1212_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1212_cast_fp16_1 = split(axis = var_1212_axis_0, split_sizes = var_1212_split_sizes_0, x = normed_45_cast_fp16)[name = string("op_1212_cast_fp16")];
+            tensor<fp16, [2560]> layers_1_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_1_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391910976)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_19_cast_fp16 = mul(x = var_1212_cast_fp16_0, y = layers_1_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_19_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_21_cast_fp16 = add(x = hidden_states_15_cast_fp16, y = hidden_states_19_cast_fp16)[name = string("hidden_states_21_cast_fp16")];
+            tensor<fp16, [1]> const_15_promoted_to_fp16 = const()[name = string("const_15_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.acp-1])];
+            tensor<fp16, [1, 3, 2560]> x_31_cast_fp16 = mul(x = hidden_states_21_cast_fp16, y = const_15_promoted_to_fp16)[name = string("x_31_cast_fp16")];
+            int32 var_1227 = const()[name = string("op_1227"), val = int32(-1)];
+            fp16 const_16_promoted_to_fp16 = const()[name = string("const_16_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1229_cast_fp16 = mul(x = x_31_cast_fp16, y = const_16_promoted_to_fp16)[name = string("op_1229_cast_fp16")];
+            bool input_49_interleave_0 = const()[name = string("input_49_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_49_cast_fp16 = concat(axis = var_1227, interleave = input_49_interleave_0, values = (x_31_cast_fp16, var_1229_cast_fp16))[name = string("input_49_cast_fp16")];
+            tensor<int32, [1]> normed_49_axes_0 = const()[name = string("normed_49_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1224_to_fp16 = const()[name = string("op_1224_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_49_cast_fp16 = layer_norm(axes = normed_49_axes_0, epsilon = var_1224_to_fp16, x = input_49_cast_fp16)[name = string("normed_49_cast_fp16")];
+            tensor<int32, [2]> var_1234_split_sizes_0 = const()[name = string("op_1234_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1234_axis_0 = const()[name = string("op_1234_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1234_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1234_cast_fp16_1 = split(axis = var_1234_axis_0, split_sizes = var_1234_split_sizes_0, x = normed_49_cast_fp16)[name = string("op_1234_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391916160)))];
+            tensor<fp16, [1, 3, 2560]> h_13_cast_fp16 = mul(x = var_1234_cast_fp16_0, y = layers_2_input_layernorm_weight_promoted_to_fp16)[name = string("h_13_cast_fp16")];
+            tensor<int32, [3]> var_1240 = const()[name = string("op_1240"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1243_axes_0 = const()[name = string("op_1243_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1241_cast_fp16 = transpose(perm = var_1240, x = h_13_cast_fp16)[name = string("transpose_84")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1243_cast_fp16 = expand_dims(axes = var_1243_axes_0, x = var_1241_cast_fp16)[name = string("op_1243_cast_fp16")];
+            string q_21_pad_type_0 = const()[name = string("q_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_21_strides_0 = const()[name = string("q_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_21_pad_0 = const()[name = string("q_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_21_dilations_0 = const()[name = string("q_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_21_groups_0 = const()[name = string("q_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_21 = conv(dilations = q_21_dilations_0, groups = q_21_groups_0, pad = q_21_pad_0, pad_type = q_21_pad_type_0, strides = q_21_strides_0, weight = layers_2_self_attn_q_proj_weight_palettized, x = var_1243_cast_fp16)[name = string("q_21")];
+            tensor<int32, [4]> var_1264 = const()[name = string("op_1264"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_1265 = reshape(shape = var_1264, x = q_21)[name = string("op_1265")];
+            tensor<int32, [4]> transpose_40_perm_0 = const()[name = string("transpose_40_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_1288 = const()[name = string("op_1288"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_40 = transpose(perm = transpose_40_perm_0, x = var_1265)[name = string("transpose_83")];
+            tensor<fp16, [3, 8, 512]> x_33 = reshape(shape = var_1288, x = transpose_40)[name = string("x_33")];
+            int32 var_1294 = const()[name = string("op_1294"), val = int32(-1)];
+            fp16 const_17_promoted = const()[name = string("const_17_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_1296 = mul(x = x_33, y = const_17_promoted)[name = string("op_1296")];
+            bool input_53_interleave_0 = const()[name = string("input_53_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_53 = concat(axis = var_1294, interleave = input_53_interleave_0, values = (x_33, var_1296))[name = string("input_53")];
+            tensor<int32, [1]> normed_53_axes_0 = const()[name = string("normed_53_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1291_to_fp16 = const()[name = string("op_1291_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_53_cast_fp16 = layer_norm(axes = normed_53_axes_0, epsilon = var_1291_to_fp16, x = input_53)[name = string("normed_53_cast_fp16")];
+            tensor<int32, [2]> var_1301_split_sizes_0 = const()[name = string("op_1301_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_1301_axis_0 = const()[name = string("op_1301_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_1301_0, tensor<fp16, [3, 8, 512]> var_1301_1 = split(axis = var_1301_axis_0, split_sizes = var_1301_split_sizes_0, x = normed_53_cast_fp16)[name = string("op_1301")];
+            tensor<fp16, [3, 8, 512]> q_25 = mul(x = var_1301_0, y = layers_2_self_attn_q_norm_weight)[name = string("q_25")];
+            tensor<int32, [4]> var_1308 = const()[name = string("op_1308"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_1309 = reshape(shape = var_1308, x = q_25)[name = string("op_1309")];
+            tensor<int32, [4]> var_1314 = const()[name = string("op_1314"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_27 = transpose(perm = var_1314, x = var_1309)[name = string("transpose_82")];
+            tensor<fp16, [1, 8, 3, 512]> var_1316_cast_fp16 = mul(x = q_27, y = cos_f)[name = string("op_1316_cast_fp16")];
+            tensor<int32, [2]> var_1317_split_sizes_0 = const()[name = string("op_1317_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1317_axis_0 = const()[name = string("op_1317_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_1317_0, tensor<fp16, [1, 8, 3, 256]> var_1317_1 = split(axis = var_1317_axis_0, split_sizes = var_1317_split_sizes_0, x = q_27)[name = string("op_1317")];
+            fp16 const_18_promoted = const()[name = string("const_18_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_1319 = mul(x = var_1317_1, y = const_18_promoted)[name = string("op_1319")];
+            int32 var_1321 = const()[name = string("op_1321"), val = int32(-1)];
+            bool var_1322_interleave_0 = const()[name = string("op_1322_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_1322 = concat(axis = var_1321, interleave = var_1322_interleave_0, values = (var_1319, var_1317_0))[name = string("op_1322")];
+            tensor<fp16, [1, 8, 3, 512]> var_1323_cast_fp16 = mul(x = var_1322, y = sin_f)[name = string("op_1323_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_29_cast_fp16 = add(x = var_1316_cast_fp16, y = var_1323_cast_fp16)[name = string("q_29_cast_fp16")];
+            tensor<int32, [4]> transpose_8_perm_0 = const()[name = string("transpose_8_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_4_reps_0 = const()[name = string("tile_4_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_8_cast_fp16 = transpose(perm = transpose_8_perm_0, x = kv14_k)[name = string("transpose_81")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_4_cast_fp16 = tile(reps = tile_4_reps_0, x = transpose_8_cast_fp16)[name = string("tile_4_cast_fp16")];
+            tensor<int32, [5]> concat_8 = const()[name = string("concat_8"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_8_cast_fp16 = reshape(shape = concat_8, x = tile_4_cast_fp16)[name = string("reshape_8_cast_fp16")];
+            tensor<int32, [5]> transpose_9_perm_0 = const()[name = string("transpose_9_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_9 = const()[name = string("concat_9"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_9_cast_fp16 = transpose(perm = transpose_9_perm_0, x = reshape_8_cast_fp16)[name = string("transpose_80")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_9_cast_fp16 = reshape(shape = concat_9, x = transpose_9_cast_fp16)[name = string("reshape_9_cast_fp16")];
+            tensor<int32, [4]> transpose_41_perm_0 = const()[name = string("transpose_41_perm_0"), val = tensor<int32, [4]>([1, 0, -1, -2])];
+            tensor<int32, [4]> transpose_10_perm_0 = const()[name = string("transpose_10_perm_0"), val = tensor<int32, [4]>([1, 0, 2, 3])];
+            tensor<int32, [4]> tile_5_reps_0 = const()[name = string("tile_5_reps_0"), val = tensor<int32, [4]>([4, 1, 1, 1])];
+            tensor<fp16, [2, 1, 2048, 512]> transpose_10_cast_fp16 = transpose(perm = transpose_10_perm_0, x = kv14_v)[name = string("transpose_79")];
+            tensor<fp16, [8, 1, 2048, 512]> tile_5_cast_fp16 = tile(reps = tile_5_reps_0, x = transpose_10_cast_fp16)[name = string("tile_5_cast_fp16")];
+            tensor<int32, [5]> concat_10 = const()[name = string("concat_10"), val = tensor<int32, [5]>([4, 2, 1, 2048, 512])];
+            tensor<fp16, [4, 2, 1, 2048, 512]> reshape_10_cast_fp16 = reshape(shape = concat_10, x = tile_5_cast_fp16)[name = string("reshape_10_cast_fp16")];
+            tensor<int32, [5]> transpose_11_perm_0 = const()[name = string("transpose_11_perm_0"), val = tensor<int32, [5]>([1, 0, 2, 3, 4])];
+            tensor<int32, [4]> concat_11 = const()[name = string("concat_11"), val = tensor<int32, [4]>([-1, 1, 2048, 512])];
+            tensor<fp16, [2, 4, 1, 2048, 512]> transpose_11_cast_fp16 = transpose(perm = transpose_11_perm_0, x = reshape_10_cast_fp16)[name = string("transpose_78")];
+            tensor<fp16, [8, 1, 2048, 512]> reshape_11_cast_fp16 = reshape(shape = concat_11, x = transpose_11_cast_fp16)[name = string("reshape_11_cast_fp16")];
+            tensor<int32, [4]> V_expanded_5_perm_0 = const()[name = string("V_expanded_5_perm_0"), val = tensor<int32, [4]>([1, 0, -2, -1])];
+            bool attn_weights_9_transpose_x_0 = const()[name = string("attn_weights_9_transpose_x_0"), val = bool(false)];
+            bool attn_weights_9_transpose_y_0 = const()[name = string("attn_weights_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 512, 2048]> transpose_41_cast_fp16 = transpose(perm = transpose_41_perm_0, x = reshape_9_cast_fp16)[name = string("transpose_77")];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_9_cast_fp16 = matmul(transpose_x = attn_weights_9_transpose_x_0, transpose_y = attn_weights_9_transpose_y_0, x = q_29_cast_fp16, y = transpose_41_cast_fp16)[name = string("attn_weights_9_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_35_cast_fp16 = add(x = attn_weights_9_cast_fp16, y = causal_mask_full)[name = string("x_35_cast_fp16")];
+            tensor<int32, [1]> reduce_max_2_axes_0 = const()[name = string("reduce_max_2_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_2_keep_dims_0 = const()[name = string("reduce_max_2_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_2 = reduce_max(axes = reduce_max_2_axes_0, keep_dims = reduce_max_2_keep_dims_0, x = x_35_cast_fp16)[name = string("reduce_max_2")];
+            tensor<fp16, [1, 8, 3, 2048]> var_1355 = sub(x = x_35_cast_fp16, y = reduce_max_2)[name = string("op_1355")];
+            tensor<fp16, [1, 8, 3, 2048]> var_1361 = exp(x = var_1355)[name = string("op_1361")];
+            tensor<int32, [1]> var_1371_axes_0 = const()[name = string("op_1371_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1371_keep_dims_0 = const()[name = string("op_1371_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1371 = reduce_sum(axes = var_1371_axes_0, keep_dims = var_1371_keep_dims_0, x = var_1361)[name = string("op_1371")];
+            tensor<fp16, [1, 8, 3, 2048]> var_1377_cast_fp16 = real_div(x = var_1361, y = var_1371)[name = string("op_1377_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 2048, 512]> V_expanded_5_cast_fp16 = transpose(perm = V_expanded_5_perm_0, x = reshape_11_cast_fp16)[name = string("transpose_76")];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = var_1377_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1388 = const()[name = string("op_1388"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1395 = const()[name = string("op_1395"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_1389_cast_fp16 = transpose(perm = var_1388, x = attn_output_13_cast_fp16)[name = string("transpose_75")];
+            tensor<fp16, [1, 3, 4096]> attn_output_15_cast_fp16 = reshape(shape = var_1395, x = var_1389_cast_fp16)[name = string("attn_output_15_cast_fp16")];
+            tensor<int32, [3]> var_1400 = const()[name = string("op_1400"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1416_pad_type_0 = const()[name = string("op_1416_pad_type_0"), val = string("valid")];
+            int32 var_1416_groups_0 = const()[name = string("op_1416_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1416_strides_0 = const()[name = string("op_1416_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1416_pad_0 = const()[name = string("op_1416_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1416_dilations_0 = const()[name = string("op_1416_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_2_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(391921344))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397164288))))[name = string("squeeze_2_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_1401_cast_fp16 = transpose(perm = var_1400, x = attn_output_15_cast_fp16)[name = string("transpose_74")];
+            tensor<fp16, [1, 2560, 3]> var_1416_cast_fp16 = conv(dilations = var_1416_dilations_0, groups = var_1416_groups_0, pad = var_1416_pad_0, pad_type = var_1416_pad_type_0, strides = var_1416_strides_0, weight = squeeze_2_cast_fp16_to_fp32_to_fp16_palettized, x = var_1401_cast_fp16)[name = string("op_1416_cast_fp16")];
+            tensor<int32, [3]> var_1420 = const()[name = string("op_1420"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1426 = const()[name = string("op_1426"), val = int32(-1)];
+            fp16 const_19_promoted_to_fp16 = const()[name = string("const_19_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_39_cast_fp16 = transpose(perm = var_1420, x = var_1416_cast_fp16)[name = string("transpose_73")];
+            tensor<fp16, [1, 3, 2560]> var_1428_cast_fp16 = mul(x = x_39_cast_fp16, y = const_19_promoted_to_fp16)[name = string("op_1428_cast_fp16")];
+            bool input_57_interleave_0 = const()[name = string("input_57_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_57_cast_fp16 = concat(axis = var_1426, interleave = input_57_interleave_0, values = (x_39_cast_fp16, var_1428_cast_fp16))[name = string("input_57_cast_fp16")];
+            tensor<int32, [1]> normed_57_axes_0 = const()[name = string("normed_57_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1423_to_fp16 = const()[name = string("op_1423_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_57_cast_fp16 = layer_norm(axes = normed_57_axes_0, epsilon = var_1423_to_fp16, x = input_57_cast_fp16)[name = string("normed_57_cast_fp16")];
+            tensor<int32, [2]> var_1433_split_sizes_0 = const()[name = string("op_1433_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1433_axis_0 = const()[name = string("op_1433_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1433_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1433_cast_fp16_1 = split(axis = var_1433_axis_0, split_sizes = var_1433_split_sizes_0, x = normed_57_cast_fp16)[name = string("op_1433_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397166912)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_17_cast_fp16 = mul(x = var_1433_cast_fp16_0, y = layers_2_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_41_cast_fp16 = add(x = x_31_cast_fp16, y = attn_output_17_cast_fp16)[name = string("x_41_cast_fp16")];
+            int32 var_1442 = const()[name = string("op_1442"), val = int32(-1)];
+            fp16 const_20_promoted_to_fp16 = const()[name = string("const_20_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1444_cast_fp16 = mul(x = x_41_cast_fp16, y = const_20_promoted_to_fp16)[name = string("op_1444_cast_fp16")];
+            bool input_59_interleave_0 = const()[name = string("input_59_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_59_cast_fp16 = concat(axis = var_1442, interleave = input_59_interleave_0, values = (x_41_cast_fp16, var_1444_cast_fp16))[name = string("input_59_cast_fp16")];
+            tensor<int32, [1]> normed_61_axes_0 = const()[name = string("normed_61_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1439_to_fp16 = const()[name = string("op_1439_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_61_cast_fp16 = layer_norm(axes = normed_61_axes_0, epsilon = var_1439_to_fp16, x = input_59_cast_fp16)[name = string("normed_61_cast_fp16")];
+            tensor<int32, [2]> var_1449_split_sizes_0 = const()[name = string("op_1449_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1449_axis_0 = const()[name = string("op_1449_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1449_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1449_cast_fp16_1 = split(axis = var_1449_axis_0, split_sizes = var_1449_split_sizes_0, x = normed_61_cast_fp16)[name = string("op_1449_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397172096)))];
+            tensor<fp16, [1, 3, 2560]> h_15_cast_fp16 = mul(x = var_1449_cast_fp16_0, y = layers_2_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_15_cast_fp16")];
+            tensor<int32, [3]> var_1460 = const()[name = string("op_1460"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_61_axes_0 = const()[name = string("input_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1461 = transpose(perm = var_1460, x = h_15_cast_fp16)[name = string("transpose_72")];
+            tensor<fp16, [1, 2560, 1, 3]> input_61 = expand_dims(axes = input_61_axes_0, x = var_1461)[name = string("input_61")];
+            string gate_9_pad_type_0 = const()[name = string("gate_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_9_strides_0 = const()[name = string("gate_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_9_pad_0 = const()[name = string("gate_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_9_dilations_0 = const()[name = string("gate_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_9_groups_0 = const()[name = string("gate_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_9 = conv(dilations = gate_9_dilations_0, groups = gate_9_groups_0, pad = gate_9_pad_0, pad_type = gate_9_pad_type_0, strides = gate_9_strides_0, weight = layers_2_mlp_gate_proj_weight_palettized, x = input_61)[name = string("gate_9")];
+            string up_5_pad_type_0 = const()[name = string("up_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_5_strides_0 = const()[name = string("up_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_5_pad_0 = const()[name = string("up_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_5_dilations_0 = const()[name = string("up_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_5_groups_0 = const()[name = string("up_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_5 = conv(dilations = up_5_dilations_0, groups = up_5_groups_0, pad = up_5_pad_0, pad_type = up_5_pad_type_0, strides = up_5_strides_0, weight = layers_2_mlp_up_proj_weight_palettized, x = input_61)[name = string("up_5")];
+            string gate_11_mode_0 = const()[name = string("gate_11_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_11 = gelu(mode = gate_11_mode_0, x = gate_9)[name = string("gate_11")];
+            tensor<fp16, [1, 10240, 1, 3]> input_63 = mul(x = gate_11, y = up_5)[name = string("input_63")];
+            string mlp_out_5_pad_type_0 = const()[name = string("mlp_out_5_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_5_strides_0 = const()[name = string("mlp_out_5_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_5_pad_0 = const()[name = string("mlp_out_5_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_5_dilations_0 = const()[name = string("mlp_out_5_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_5_groups_0 = const()[name = string("mlp_out_5_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_5 = conv(dilations = mlp_out_5_dilations_0, groups = mlp_out_5_groups_0, pad = mlp_out_5_pad_0, pad_type = mlp_out_5_pad_type_0, strides = mlp_out_5_strides_0, weight = layers_2_mlp_down_proj_weight_palettized, x = input_63)[name = string("mlp_out_5")];
+            tensor<int32, [1]> var_1501_axes_0 = const()[name = string("op_1501_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1501 = squeeze(axes = var_1501_axes_0, x = mlp_out_5)[name = string("op_1501")];
+            tensor<int32, [3]> var_1505 = const()[name = string("op_1505"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1511 = const()[name = string("op_1511"), val = int32(-1)];
+            fp16 const_21_promoted = const()[name = string("const_21_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_43 = transpose(perm = var_1505, x = var_1501)[name = string("transpose_71")];
+            tensor<fp16, [1, 3, 2560]> var_1513 = mul(x = x_43, y = const_21_promoted)[name = string("op_1513")];
+            bool input_65_interleave_0 = const()[name = string("input_65_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_65 = concat(axis = var_1511, interleave = input_65_interleave_0, values = (x_43, var_1513))[name = string("input_65")];
+            tensor<int32, [1]> normed_65_axes_0 = const()[name = string("normed_65_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1508_to_fp16 = const()[name = string("op_1508_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_65_cast_fp16 = layer_norm(axes = normed_65_axes_0, epsilon = var_1508_to_fp16, x = input_65)[name = string("normed_65_cast_fp16")];
+            tensor<int32, [2]> var_1518_split_sizes_0 = const()[name = string("op_1518_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1518_axis_0 = const()[name = string("op_1518_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1518_0, tensor<fp16, [1, 3, 2560]> var_1518_1 = split(axis = var_1518_axis_0, split_sizes = var_1518_split_sizes_0, x = normed_65_cast_fp16)[name = string("op_1518")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_23 = mul(x = var_1518_0, y = layers_2_post_feedforward_layernorm_weight)[name = string("hidden_states_23")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_25_cast_fp16 = add(x = x_41_cast_fp16, y = hidden_states_23)[name = string("hidden_states_25_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_5_begin_0 = const()[name = string("per_layer_slice_5_begin_0"), val = tensor<int32, [3]>([0, 0, 8960])];
+            tensor<int32, [3]> per_layer_slice_5_end_0 = const()[name = string("per_layer_slice_5_end_0"), val = tensor<int32, [3]>([1, 3, 9216])];
+            tensor<bool, [3]> per_layer_slice_5_end_mask_0 = const()[name = string("per_layer_slice_5_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_5_cast_fp16 = slice_by_index(begin = per_layer_slice_5_begin_0, end = per_layer_slice_5_end_0, end_mask = per_layer_slice_5_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_5_cast_fp16")];
+            tensor<int32, [3]> var_1546 = const()[name = string("op_1546"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_67_axes_0 = const()[name = string("input_67_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1547 = transpose(perm = var_1546, x = hidden_states_25_cast_fp16)[name = string("transpose_70")];
+            tensor<fp16, [1, 2560, 1, 3]> input_67 = expand_dims(axes = input_67_axes_0, x = var_1547)[name = string("input_67")];
+            string gated_13_pad_type_0 = const()[name = string("gated_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_13_strides_0 = const()[name = string("gated_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_13_pad_0 = const()[name = string("gated_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_13_dilations_0 = const()[name = string("gated_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_13_groups_0 = const()[name = string("gated_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_13 = conv(dilations = gated_13_dilations_0, groups = gated_13_groups_0, pad = gated_13_pad_0, pad_type = gated_13_pad_type_0, strides = gated_13_strides_0, weight = layers_2_per_layer_input_gate_weight_palettized, x = input_67)[name = string("gated_13")];
+            string gated_15_mode_0 = const()[name = string("gated_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_15 = gelu(mode = gated_15_mode_0, x = gated_13)[name = string("gated_15")];
+            tensor<int32, [3]> var_1566 = const()[name = string("op_1566"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_5_axes_0 = const()[name = string("per_layer_slice_conv_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1567_cast_fp16 = transpose(perm = var_1566, x = per_layer_slice_5_cast_fp16)[name = string("transpose_69")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_5_cast_fp16 = expand_dims(axes = per_layer_slice_conv_5_axes_0, x = var_1567_cast_fp16)[name = string("per_layer_slice_conv_5_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_69_cast_fp16 = mul(x = gated_15, y = per_layer_slice_conv_5_cast_fp16)[name = string("input_69_cast_fp16")];
+            string gated_17_pad_type_0 = const()[name = string("gated_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_17_strides_0 = const()[name = string("gated_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_17_pad_0 = const()[name = string("gated_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_17_dilations_0 = const()[name = string("gated_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_17_groups_0 = const()[name = string("gated_17_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_2_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397177280))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397505024))))[name = string("layers_2_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_17_cast_fp16 = conv(dilations = gated_17_dilations_0, groups = gated_17_groups_0, pad = gated_17_pad_0, pad_type = gated_17_pad_type_0, strides = gated_17_strides_0, weight = layers_2_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_69_cast_fp16)[name = string("gated_17_cast_fp16")];
+            tensor<int32, [1]> var_1583_axes_0 = const()[name = string("op_1583_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1583_cast_fp16 = squeeze(axes = var_1583_axes_0, x = gated_17_cast_fp16)[name = string("op_1583_cast_fp16")];
+            tensor<int32, [3]> var_1587 = const()[name = string("op_1587"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1593 = const()[name = string("op_1593"), val = int32(-1)];
+            fp16 const_22_promoted_to_fp16 = const()[name = string("const_22_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_45_cast_fp16 = transpose(perm = var_1587, x = var_1583_cast_fp16)[name = string("transpose_68")];
+            tensor<fp16, [1, 3, 2560]> var_1595_cast_fp16 = mul(x = x_45_cast_fp16, y = const_22_promoted_to_fp16)[name = string("op_1595_cast_fp16")];
+            bool input_71_interleave_0 = const()[name = string("input_71_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_71_cast_fp16 = concat(axis = var_1593, interleave = input_71_interleave_0, values = (x_45_cast_fp16, var_1595_cast_fp16))[name = string("input_71_cast_fp16")];
+            tensor<int32, [1]> normed_69_axes_0 = const()[name = string("normed_69_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1590_to_fp16 = const()[name = string("op_1590_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_69_cast_fp16 = layer_norm(axes = normed_69_axes_0, epsilon = var_1590_to_fp16, x = input_71_cast_fp16)[name = string("normed_69_cast_fp16")];
+            tensor<int32, [2]> var_1600_split_sizes_0 = const()[name = string("op_1600_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1600_axis_0 = const()[name = string("op_1600_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1600_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1600_cast_fp16_1 = split(axis = var_1600_axis_0, split_sizes = var_1600_split_sizes_0, x = normed_69_cast_fp16)[name = string("op_1600_cast_fp16")];
+            tensor<fp16, [2560]> layers_2_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_2_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397507648)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_29_cast_fp16 = mul(x = var_1600_cast_fp16_0, y = layers_2_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_31_cast_fp16 = add(x = hidden_states_25_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("hidden_states_31_cast_fp16")];
+            tensor<fp16, [1]> const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.acp-1])];
+            tensor<fp16, [1, 3, 2560]> x_47_cast_fp16 = mul(x = hidden_states_31_cast_fp16, y = const_23_promoted_to_fp16)[name = string("x_47_cast_fp16")];
+            int32 var_1615 = const()[name = string("op_1615"), val = int32(-1)];
+            fp16 const_24_promoted_to_fp16 = const()[name = string("const_24_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1617_cast_fp16 = mul(x = x_47_cast_fp16, y = const_24_promoted_to_fp16)[name = string("op_1617_cast_fp16")];
+            bool input_73_interleave_0 = const()[name = string("input_73_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_73_cast_fp16 = concat(axis = var_1615, interleave = input_73_interleave_0, values = (x_47_cast_fp16, var_1617_cast_fp16))[name = string("input_73_cast_fp16")];
+            tensor<int32, [1]> normed_73_axes_0 = const()[name = string("normed_73_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1612_to_fp16 = const()[name = string("op_1612_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_73_cast_fp16 = layer_norm(axes = normed_73_axes_0, epsilon = var_1612_to_fp16, x = input_73_cast_fp16)[name = string("normed_73_cast_fp16")];
+            tensor<int32, [2]> var_1622_split_sizes_0 = const()[name = string("op_1622_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1622_axis_0 = const()[name = string("op_1622_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1622_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1622_cast_fp16_1 = split(axis = var_1622_axis_0, split_sizes = var_1622_split_sizes_0, x = normed_73_cast_fp16)[name = string("op_1622_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397512832)))];
+            tensor<fp16, [1, 3, 2560]> h_19_cast_fp16 = mul(x = var_1622_cast_fp16_0, y = layers_3_input_layernorm_weight_promoted_to_fp16)[name = string("h_19_cast_fp16")];
+            tensor<int32, [3]> var_1628 = const()[name = string("op_1628"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_1631_axes_0 = const()[name = string("op_1631_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1629_cast_fp16 = transpose(perm = var_1628, x = h_19_cast_fp16)[name = string("transpose_67")];
+            tensor<fp16, [1, 2560, 1, 3]> var_1631_cast_fp16 = expand_dims(axes = var_1631_axes_0, x = var_1629_cast_fp16)[name = string("op_1631_cast_fp16")];
+            string q_31_pad_type_0 = const()[name = string("q_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_31_strides_0 = const()[name = string("q_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_31_pad_0 = const()[name = string("q_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_31_dilations_0 = const()[name = string("q_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_31_groups_0 = const()[name = string("q_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_31 = conv(dilations = q_31_dilations_0, groups = q_31_groups_0, pad = q_31_pad_0, pad_type = q_31_pad_type_0, strides = q_31_strides_0, weight = layers_3_self_attn_q_proj_weight_palettized, x = var_1631_cast_fp16)[name = string("q_31")];
+            tensor<int32, [4]> var_1652 = const()[name = string("op_1652"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_1653 = reshape(shape = var_1652, x = q_31)[name = string("op_1653")];
+            tensor<int32, [4]> transpose_42_perm_0 = const()[name = string("transpose_42_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_1676 = const()[name = string("op_1676"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_42 = transpose(perm = transpose_42_perm_0, x = var_1653)[name = string("transpose_66")];
+            tensor<fp16, [3, 8, 256]> x_49 = reshape(shape = var_1676, x = transpose_42)[name = string("x_49")];
+            int32 var_1682 = const()[name = string("op_1682"), val = int32(-1)];
+            fp16 const_25_promoted = const()[name = string("const_25_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_1684 = mul(x = x_49, y = const_25_promoted)[name = string("op_1684")];
+            bool input_77_interleave_0 = const()[name = string("input_77_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_77 = concat(axis = var_1682, interleave = input_77_interleave_0, values = (x_49, var_1684))[name = string("input_77")];
+            tensor<int32, [1]> normed_77_axes_0 = const()[name = string("normed_77_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1679_to_fp16 = const()[name = string("op_1679_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_77_cast_fp16 = layer_norm(axes = normed_77_axes_0, epsilon = var_1679_to_fp16, x = input_77)[name = string("normed_77_cast_fp16")];
+            tensor<int32, [2]> var_1689_split_sizes_0 = const()[name = string("op_1689_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_1689_axis_0 = const()[name = string("op_1689_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_1689_0, tensor<fp16, [3, 8, 256]> var_1689_1 = split(axis = var_1689_axis_0, split_sizes = var_1689_split_sizes_0, x = normed_77_cast_fp16)[name = string("op_1689")];
+            tensor<fp16, [3, 8, 256]> q_35 = mul(x = var_1689_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_35")];
+            tensor<int32, [4]> var_1696 = const()[name = string("op_1696"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_1697 = reshape(shape = var_1696, x = q_35)[name = string("op_1697")];
+            tensor<int32, [4]> var_1702 = const()[name = string("op_1702"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_37 = transpose(perm = var_1702, x = var_1697)[name = string("transpose_65")];
+            tensor<fp16, [1, 8, 3, 256]> var_1704_cast_fp16 = mul(x = q_37, y = cos_s)[name = string("op_1704_cast_fp16")];
+            tensor<int32, [2]> var_1705_split_sizes_0 = const()[name = string("op_1705_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_1705_axis_0 = const()[name = string("op_1705_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_1705_0, tensor<fp16, [1, 8, 3, 128]> var_1705_1 = split(axis = var_1705_axis_0, split_sizes = var_1705_split_sizes_0, x = q_37)[name = string("op_1705")];
+            fp16 const_26_promoted = const()[name = string("const_26_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_1707 = mul(x = var_1705_1, y = const_26_promoted)[name = string("op_1707")];
+            int32 var_1709 = const()[name = string("op_1709"), val = int32(-1)];
+            bool var_1710_interleave_0 = const()[name = string("op_1710_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_1710 = concat(axis = var_1709, interleave = var_1710_interleave_0, values = (var_1707, var_1705_0))[name = string("op_1710")];
+            tensor<fp16, [1, 8, 3, 256]> var_1711_cast_fp16 = mul(x = var_1710, y = sin_s)[name = string("op_1711_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_39_cast_fp16 = add(x = var_1704_cast_fp16, y = var_1711_cast_fp16)[name = string("q_39_cast_fp16")];
+            bool attn_weights_13_transpose_x_0 = const()[name = string("attn_weights_13_transpose_x_0"), val = bool(false)];
+            bool attn_weights_13_transpose_y_0 = const()[name = string("attn_weights_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_13_cast_fp16 = matmul(transpose_x = attn_weights_13_transpose_x_0, transpose_y = attn_weights_13_transpose_y_0, x = q_39_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_13_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_51_cast_fp16 = add(x = attn_weights_13_cast_fp16, y = causal_mask_sliding)[name = string("x_51_cast_fp16")];
+            tensor<int32, [1]> reduce_max_3_axes_0 = const()[name = string("reduce_max_3_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_3_keep_dims_0 = const()[name = string("reduce_max_3_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_3 = reduce_max(axes = reduce_max_3_axes_0, keep_dims = reduce_max_3_keep_dims_0, x = x_51_cast_fp16)[name = string("reduce_max_3")];
+            tensor<fp16, [1, 8, 3, 512]> var_1743 = sub(x = x_51_cast_fp16, y = reduce_max_3)[name = string("op_1743")];
+            tensor<fp16, [1, 8, 3, 512]> var_1749 = exp(x = var_1743)[name = string("op_1749")];
+            tensor<int32, [1]> var_1759_axes_0 = const()[name = string("op_1759_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1759_keep_dims_0 = const()[name = string("op_1759_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_1759 = reduce_sum(axes = var_1759_axes_0, keep_dims = var_1759_keep_dims_0, x = var_1749)[name = string("op_1759")];
+            tensor<fp16, [1, 8, 3, 512]> var_1765_cast_fp16 = real_div(x = var_1749, y = var_1759)[name = string("op_1765_cast_fp16")];
+            bool attn_output_19_transpose_x_0 = const()[name = string("attn_output_19_transpose_x_0"), val = bool(false)];
+            bool attn_output_19_transpose_y_0 = const()[name = string("attn_output_19_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_19_cast_fp16 = matmul(transpose_x = attn_output_19_transpose_x_0, transpose_y = attn_output_19_transpose_y_0, x = var_1765_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_19_cast_fp16")];
+            tensor<int32, [4]> var_1776 = const()[name = string("op_1776"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1783 = const()[name = string("op_1783"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_1777_cast_fp16 = transpose(perm = var_1776, x = attn_output_19_cast_fp16)[name = string("transpose_64")];
+            tensor<fp16, [1, 3, 2048]> attn_output_21_cast_fp16 = reshape(shape = var_1783, x = var_1777_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [3]> var_1788 = const()[name = string("op_1788"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_1804_pad_type_0 = const()[name = string("op_1804_pad_type_0"), val = string("valid")];
+            int32 var_1804_groups_0 = const()[name = string("op_1804_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_1804_strides_0 = const()[name = string("op_1804_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_1804_pad_0 = const()[name = string("op_1804_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_1804_dilations_0 = const()[name = string("op_1804_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_3_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(397518016))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400139520))))[name = string("squeeze_3_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_1789_cast_fp16 = transpose(perm = var_1788, x = attn_output_21_cast_fp16)[name = string("transpose_63")];
+            tensor<fp16, [1, 2560, 3]> var_1804_cast_fp16 = conv(dilations = var_1804_dilations_0, groups = var_1804_groups_0, pad = var_1804_pad_0, pad_type = var_1804_pad_type_0, strides = var_1804_strides_0, weight = squeeze_3_cast_fp16_to_fp32_to_fp16_palettized, x = var_1789_cast_fp16)[name = string("op_1804_cast_fp16")];
+            tensor<int32, [3]> var_1808 = const()[name = string("op_1808"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1814 = const()[name = string("op_1814"), val = int32(-1)];
+            fp16 const_27_promoted_to_fp16 = const()[name = string("const_27_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_55_cast_fp16 = transpose(perm = var_1808, x = var_1804_cast_fp16)[name = string("transpose_62")];
+            tensor<fp16, [1, 3, 2560]> var_1816_cast_fp16 = mul(x = x_55_cast_fp16, y = const_27_promoted_to_fp16)[name = string("op_1816_cast_fp16")];
+            bool input_81_interleave_0 = const()[name = string("input_81_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_81_cast_fp16 = concat(axis = var_1814, interleave = input_81_interleave_0, values = (x_55_cast_fp16, var_1816_cast_fp16))[name = string("input_81_cast_fp16")];
+            tensor<int32, [1]> normed_81_axes_0 = const()[name = string("normed_81_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1811_to_fp16 = const()[name = string("op_1811_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_81_cast_fp16 = layer_norm(axes = normed_81_axes_0, epsilon = var_1811_to_fp16, x = input_81_cast_fp16)[name = string("normed_81_cast_fp16")];
+            tensor<int32, [2]> var_1821_split_sizes_0 = const()[name = string("op_1821_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1821_axis_0 = const()[name = string("op_1821_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1821_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1821_cast_fp16_1 = split(axis = var_1821_axis_0, split_sizes = var_1821_split_sizes_0, x = normed_81_cast_fp16)[name = string("op_1821_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400142144)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_23_cast_fp16 = mul(x = var_1821_cast_fp16_0, y = layers_3_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_23_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_57_cast_fp16 = add(x = x_47_cast_fp16, y = attn_output_23_cast_fp16)[name = string("x_57_cast_fp16")];
+            int32 var_1830 = const()[name = string("op_1830"), val = int32(-1)];
+            fp16 const_28_promoted_to_fp16 = const()[name = string("const_28_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_1832_cast_fp16 = mul(x = x_57_cast_fp16, y = const_28_promoted_to_fp16)[name = string("op_1832_cast_fp16")];
+            bool input_83_interleave_0 = const()[name = string("input_83_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_83_cast_fp16 = concat(axis = var_1830, interleave = input_83_interleave_0, values = (x_57_cast_fp16, var_1832_cast_fp16))[name = string("input_83_cast_fp16")];
+            tensor<int32, [1]> normed_85_axes_0 = const()[name = string("normed_85_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1827_to_fp16 = const()[name = string("op_1827_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_85_cast_fp16 = layer_norm(axes = normed_85_axes_0, epsilon = var_1827_to_fp16, x = input_83_cast_fp16)[name = string("normed_85_cast_fp16")];
+            tensor<int32, [2]> var_1837_split_sizes_0 = const()[name = string("op_1837_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1837_axis_0 = const()[name = string("op_1837_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1837_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1837_cast_fp16_1 = split(axis = var_1837_axis_0, split_sizes = var_1837_split_sizes_0, x = normed_85_cast_fp16)[name = string("op_1837_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400147328)))];
+            tensor<fp16, [1, 3, 2560]> h_21_cast_fp16 = mul(x = var_1837_cast_fp16_0, y = layers_3_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_21_cast_fp16")];
+            tensor<int32, [3]> var_1848 = const()[name = string("op_1848"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_85_axes_0 = const()[name = string("input_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1849 = transpose(perm = var_1848, x = h_21_cast_fp16)[name = string("transpose_61")];
+            tensor<fp16, [1, 2560, 1, 3]> input_85 = expand_dims(axes = input_85_axes_0, x = var_1849)[name = string("input_85")];
+            string gate_13_pad_type_0 = const()[name = string("gate_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_13_strides_0 = const()[name = string("gate_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_13_pad_0 = const()[name = string("gate_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_13_dilations_0 = const()[name = string("gate_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_13_groups_0 = const()[name = string("gate_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_13 = conv(dilations = gate_13_dilations_0, groups = gate_13_groups_0, pad = gate_13_pad_0, pad_type = gate_13_pad_type_0, strides = gate_13_strides_0, weight = layers_3_mlp_gate_proj_weight_palettized, x = input_85)[name = string("gate_13")];
+            string up_7_pad_type_0 = const()[name = string("up_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_7_strides_0 = const()[name = string("up_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_7_pad_0 = const()[name = string("up_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_7_dilations_0 = const()[name = string("up_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_7_groups_0 = const()[name = string("up_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_7 = conv(dilations = up_7_dilations_0, groups = up_7_groups_0, pad = up_7_pad_0, pad_type = up_7_pad_type_0, strides = up_7_strides_0, weight = layers_3_mlp_up_proj_weight_palettized, x = input_85)[name = string("up_7")];
+            string gate_15_mode_0 = const()[name = string("gate_15_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_15 = gelu(mode = gate_15_mode_0, x = gate_13)[name = string("gate_15")];
+            tensor<fp16, [1, 10240, 1, 3]> input_87 = mul(x = gate_15, y = up_7)[name = string("input_87")];
+            string mlp_out_7_pad_type_0 = const()[name = string("mlp_out_7_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_7_strides_0 = const()[name = string("mlp_out_7_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_7_pad_0 = const()[name = string("mlp_out_7_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_7_dilations_0 = const()[name = string("mlp_out_7_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_7_groups_0 = const()[name = string("mlp_out_7_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_7 = conv(dilations = mlp_out_7_dilations_0, groups = mlp_out_7_groups_0, pad = mlp_out_7_pad_0, pad_type = mlp_out_7_pad_type_0, strides = mlp_out_7_strides_0, weight = layers_3_mlp_down_proj_weight_palettized, x = input_87)[name = string("mlp_out_7")];
+            tensor<int32, [1]> var_1889_axes_0 = const()[name = string("op_1889_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1889 = squeeze(axes = var_1889_axes_0, x = mlp_out_7)[name = string("op_1889")];
+            tensor<int32, [3]> var_1893 = const()[name = string("op_1893"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1899 = const()[name = string("op_1899"), val = int32(-1)];
+            fp16 const_29_promoted = const()[name = string("const_29_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_59 = transpose(perm = var_1893, x = var_1889)[name = string("transpose_60")];
+            tensor<fp16, [1, 3, 2560]> var_1901 = mul(x = x_59, y = const_29_promoted)[name = string("op_1901")];
+            bool input_89_interleave_0 = const()[name = string("input_89_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_89 = concat(axis = var_1899, interleave = input_89_interleave_0, values = (x_59, var_1901))[name = string("input_89")];
+            tensor<int32, [1]> normed_89_axes_0 = const()[name = string("normed_89_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1896_to_fp16 = const()[name = string("op_1896_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_89_cast_fp16 = layer_norm(axes = normed_89_axes_0, epsilon = var_1896_to_fp16, x = input_89)[name = string("normed_89_cast_fp16")];
+            tensor<int32, [2]> var_1906_split_sizes_0 = const()[name = string("op_1906_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1906_axis_0 = const()[name = string("op_1906_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1906_0, tensor<fp16, [1, 3, 2560]> var_1906_1 = split(axis = var_1906_axis_0, split_sizes = var_1906_split_sizes_0, x = normed_89_cast_fp16)[name = string("op_1906")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_33 = mul(x = var_1906_0, y = layers_3_post_feedforward_layernorm_weight)[name = string("hidden_states_33")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_35_cast_fp16 = add(x = x_57_cast_fp16, y = hidden_states_33)[name = string("hidden_states_35_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_7_begin_0 = const()[name = string("per_layer_slice_7_begin_0"), val = tensor<int32, [3]>([0, 0, 9216])];
+            tensor<int32, [3]> per_layer_slice_7_end_0 = const()[name = string("per_layer_slice_7_end_0"), val = tensor<int32, [3]>([1, 3, 9472])];
+            tensor<bool, [3]> per_layer_slice_7_end_mask_0 = const()[name = string("per_layer_slice_7_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_7_cast_fp16 = slice_by_index(begin = per_layer_slice_7_begin_0, end = per_layer_slice_7_end_0, end_mask = per_layer_slice_7_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_7_cast_fp16")];
+            tensor<int32, [3]> var_1934 = const()[name = string("op_1934"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_91_axes_0 = const()[name = string("input_91_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1935 = transpose(perm = var_1934, x = hidden_states_35_cast_fp16)[name = string("transpose_59")];
+            tensor<fp16, [1, 2560, 1, 3]> input_91 = expand_dims(axes = input_91_axes_0, x = var_1935)[name = string("input_91")];
+            string gated_19_pad_type_0 = const()[name = string("gated_19_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_19_strides_0 = const()[name = string("gated_19_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_19_pad_0 = const()[name = string("gated_19_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_19_dilations_0 = const()[name = string("gated_19_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_19_groups_0 = const()[name = string("gated_19_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_19 = conv(dilations = gated_19_dilations_0, groups = gated_19_groups_0, pad = gated_19_pad_0, pad_type = gated_19_pad_type_0, strides = gated_19_strides_0, weight = layers_3_per_layer_input_gate_weight_palettized, x = input_91)[name = string("gated_19")];
+            string gated_21_mode_0 = const()[name = string("gated_21_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_21 = gelu(mode = gated_21_mode_0, x = gated_19)[name = string("gated_21")];
+            tensor<int32, [3]> var_1954 = const()[name = string("op_1954"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_7_axes_0 = const()[name = string("per_layer_slice_conv_7_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_1955_cast_fp16 = transpose(perm = var_1954, x = per_layer_slice_7_cast_fp16)[name = string("transpose_58")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_7_cast_fp16 = expand_dims(axes = per_layer_slice_conv_7_axes_0, x = var_1955_cast_fp16)[name = string("per_layer_slice_conv_7_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_93_cast_fp16 = mul(x = gated_21, y = per_layer_slice_conv_7_cast_fp16)[name = string("input_93_cast_fp16")];
+            string gated_23_pad_type_0 = const()[name = string("gated_23_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_23_strides_0 = const()[name = string("gated_23_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_23_pad_0 = const()[name = string("gated_23_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_23_dilations_0 = const()[name = string("gated_23_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_23_groups_0 = const()[name = string("gated_23_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_3_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400152512))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400480256))))[name = string("layers_3_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_23_cast_fp16 = conv(dilations = gated_23_dilations_0, groups = gated_23_groups_0, pad = gated_23_pad_0, pad_type = gated_23_pad_type_0, strides = gated_23_strides_0, weight = layers_3_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_93_cast_fp16)[name = string("gated_23_cast_fp16")];
+            tensor<int32, [1]> var_1971_axes_0 = const()[name = string("op_1971_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_1971_cast_fp16 = squeeze(axes = var_1971_axes_0, x = gated_23_cast_fp16)[name = string("op_1971_cast_fp16")];
+            tensor<int32, [3]> var_1975 = const()[name = string("op_1975"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_1981 = const()[name = string("op_1981"), val = int32(-1)];
+            fp16 const_30_promoted_to_fp16 = const()[name = string("const_30_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_61_cast_fp16 = transpose(perm = var_1975, x = var_1971_cast_fp16)[name = string("transpose_57")];
+            tensor<fp16, [1, 3, 2560]> var_1983_cast_fp16 = mul(x = x_61_cast_fp16, y = const_30_promoted_to_fp16)[name = string("op_1983_cast_fp16")];
+            bool input_95_interleave_0 = const()[name = string("input_95_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_95_cast_fp16 = concat(axis = var_1981, interleave = input_95_interleave_0, values = (x_61_cast_fp16, var_1983_cast_fp16))[name = string("input_95_cast_fp16")];
+            tensor<int32, [1]> normed_93_axes_0 = const()[name = string("normed_93_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_1978_to_fp16 = const()[name = string("op_1978_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_93_cast_fp16 = layer_norm(axes = normed_93_axes_0, epsilon = var_1978_to_fp16, x = input_95_cast_fp16)[name = string("normed_93_cast_fp16")];
+            tensor<int32, [2]> var_1988_split_sizes_0 = const()[name = string("op_1988_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_1988_axis_0 = const()[name = string("op_1988_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_1988_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_1988_cast_fp16_1 = split(axis = var_1988_axis_0, split_sizes = var_1988_split_sizes_0, x = normed_93_cast_fp16)[name = string("op_1988_cast_fp16")];
+            tensor<fp16, [2560]> layers_3_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_3_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400482880)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_39_cast_fp16 = mul(x = var_1988_cast_fp16_0, y = layers_3_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_39_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_41_cast_fp16 = add(x = hidden_states_35_cast_fp16, y = hidden_states_39_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            tensor<fp16, [1]> const_31_promoted_to_fp16 = const()[name = string("const_31_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b6p-1])];
+            tensor<fp16, [1, 3, 2560]> x_63_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = const_31_promoted_to_fp16)[name = string("x_63_cast_fp16")];
+            int32 var_2003 = const()[name = string("op_2003"), val = int32(-1)];
+            fp16 const_32_promoted_to_fp16 = const()[name = string("const_32_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2005_cast_fp16 = mul(x = x_63_cast_fp16, y = const_32_promoted_to_fp16)[name = string("op_2005_cast_fp16")];
+            bool input_97_interleave_0 = const()[name = string("input_97_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_97_cast_fp16 = concat(axis = var_2003, interleave = input_97_interleave_0, values = (x_63_cast_fp16, var_2005_cast_fp16))[name = string("input_97_cast_fp16")];
+            tensor<int32, [1]> normed_97_axes_0 = const()[name = string("normed_97_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2000_to_fp16 = const()[name = string("op_2000_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_97_cast_fp16 = layer_norm(axes = normed_97_axes_0, epsilon = var_2000_to_fp16, x = input_97_cast_fp16)[name = string("normed_97_cast_fp16")];
+            tensor<int32, [2]> var_2010_split_sizes_0 = const()[name = string("op_2010_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2010_axis_0 = const()[name = string("op_2010_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2010_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2010_cast_fp16_1 = split(axis = var_2010_axis_0, split_sizes = var_2010_split_sizes_0, x = normed_97_cast_fp16)[name = string("op_2010_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400488064)))];
+            tensor<fp16, [1, 3, 2560]> h_25_cast_fp16 = mul(x = var_2010_cast_fp16_0, y = layers_4_input_layernorm_weight_promoted_to_fp16)[name = string("h_25_cast_fp16")];
+            tensor<int32, [3]> var_2016 = const()[name = string("op_2016"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2019_axes_0 = const()[name = string("op_2019_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2017_cast_fp16 = transpose(perm = var_2016, x = h_25_cast_fp16)[name = string("transpose_56")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2019_cast_fp16 = expand_dims(axes = var_2019_axes_0, x = var_2017_cast_fp16)[name = string("op_2019_cast_fp16")];
+            string q_41_pad_type_0 = const()[name = string("q_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_41_strides_0 = const()[name = string("q_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_41_pad_0 = const()[name = string("q_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_41_dilations_0 = const()[name = string("q_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_41_groups_0 = const()[name = string("q_41_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_41 = conv(dilations = q_41_dilations_0, groups = q_41_groups_0, pad = q_41_pad_0, pad_type = q_41_pad_type_0, strides = q_41_strides_0, weight = layers_4_self_attn_q_proj_weight_palettized, x = var_2019_cast_fp16)[name = string("q_41")];
+            tensor<int32, [4]> var_2040 = const()[name = string("op_2040"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2041 = reshape(shape = var_2040, x = q_41)[name = string("op_2041")];
+            tensor<int32, [4]> transpose_44_perm_0 = const()[name = string("transpose_44_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2064 = const()[name = string("op_2064"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_44 = transpose(perm = transpose_44_perm_0, x = var_2041)[name = string("transpose_55")];
+            tensor<fp16, [3, 8, 256]> x_65 = reshape(shape = var_2064, x = transpose_44)[name = string("x_65")];
+            int32 var_2070 = const()[name = string("op_2070"), val = int32(-1)];
+            fp16 const_33_promoted = const()[name = string("const_33_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2072 = mul(x = x_65, y = const_33_promoted)[name = string("op_2072")];
+            bool input_101_interleave_0 = const()[name = string("input_101_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_101 = concat(axis = var_2070, interleave = input_101_interleave_0, values = (x_65, var_2072))[name = string("input_101")];
+            tensor<int32, [1]> normed_101_axes_0 = const()[name = string("normed_101_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2067_to_fp16 = const()[name = string("op_2067_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_101_cast_fp16 = layer_norm(axes = normed_101_axes_0, epsilon = var_2067_to_fp16, x = input_101)[name = string("normed_101_cast_fp16")];
+            tensor<int32, [2]> var_2077_split_sizes_0 = const()[name = string("op_2077_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2077_axis_0 = const()[name = string("op_2077_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2077_0, tensor<fp16, [3, 8, 256]> var_2077_1 = split(axis = var_2077_axis_0, split_sizes = var_2077_split_sizes_0, x = normed_101_cast_fp16)[name = string("op_2077")];
+            tensor<fp16, [3, 8, 256]> q_45 = mul(x = var_2077_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_45")];
+            tensor<int32, [4]> var_2084 = const()[name = string("op_2084"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2085 = reshape(shape = var_2084, x = q_45)[name = string("op_2085")];
+            tensor<int32, [4]> var_2090 = const()[name = string("op_2090"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_47 = transpose(perm = var_2090, x = var_2085)[name = string("transpose_54")];
+            tensor<fp16, [1, 8, 3, 256]> var_2092_cast_fp16 = mul(x = q_47, y = cos_s)[name = string("op_2092_cast_fp16")];
+            tensor<int32, [2]> var_2093_split_sizes_0 = const()[name = string("op_2093_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2093_axis_0 = const()[name = string("op_2093_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2093_0, tensor<fp16, [1, 8, 3, 128]> var_2093_1 = split(axis = var_2093_axis_0, split_sizes = var_2093_split_sizes_0, x = q_47)[name = string("op_2093")];
+            fp16 const_34_promoted = const()[name = string("const_34_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2095 = mul(x = var_2093_1, y = const_34_promoted)[name = string("op_2095")];
+            int32 var_2097 = const()[name = string("op_2097"), val = int32(-1)];
+            bool var_2098_interleave_0 = const()[name = string("op_2098_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2098 = concat(axis = var_2097, interleave = var_2098_interleave_0, values = (var_2095, var_2093_0))[name = string("op_2098")];
+            tensor<fp16, [1, 8, 3, 256]> var_2099_cast_fp16 = mul(x = var_2098, y = sin_s)[name = string("op_2099_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_49_cast_fp16 = add(x = var_2092_cast_fp16, y = var_2099_cast_fp16)[name = string("q_49_cast_fp16")];
+            bool attn_weights_17_transpose_x_0 = const()[name = string("attn_weights_17_transpose_x_0"), val = bool(false)];
+            bool attn_weights_17_transpose_y_0 = const()[name = string("attn_weights_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_17_cast_fp16 = matmul(transpose_x = attn_weights_17_transpose_x_0, transpose_y = attn_weights_17_transpose_y_0, x = q_49_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_17_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_67_cast_fp16 = add(x = attn_weights_17_cast_fp16, y = causal_mask_sliding)[name = string("x_67_cast_fp16")];
+            tensor<int32, [1]> reduce_max_4_axes_0 = const()[name = string("reduce_max_4_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_4_keep_dims_0 = const()[name = string("reduce_max_4_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_4 = reduce_max(axes = reduce_max_4_axes_0, keep_dims = reduce_max_4_keep_dims_0, x = x_67_cast_fp16)[name = string("reduce_max_4")];
+            tensor<fp16, [1, 8, 3, 512]> var_2131 = sub(x = x_67_cast_fp16, y = reduce_max_4)[name = string("op_2131")];
+            tensor<fp16, [1, 8, 3, 512]> var_2137 = exp(x = var_2131)[name = string("op_2137")];
+            tensor<int32, [1]> var_2147_axes_0 = const()[name = string("op_2147_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2147_keep_dims_0 = const()[name = string("op_2147_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2147 = reduce_sum(axes = var_2147_axes_0, keep_dims = var_2147_keep_dims_0, x = var_2137)[name = string("op_2147")];
+            tensor<fp16, [1, 8, 3, 512]> var_2153_cast_fp16 = real_div(x = var_2137, y = var_2147)[name = string("op_2153_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = var_2153_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_2164 = const()[name = string("op_2164"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2171 = const()[name = string("op_2171"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2165_cast_fp16 = transpose(perm = var_2164, x = attn_output_25_cast_fp16)[name = string("transpose_53")];
+            tensor<fp16, [1, 3, 2048]> attn_output_27_cast_fp16 = reshape(shape = var_2171, x = var_2165_cast_fp16)[name = string("attn_output_27_cast_fp16")];
+            tensor<int32, [3]> var_2176 = const()[name = string("op_2176"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2192_pad_type_0 = const()[name = string("op_2192_pad_type_0"), val = string("valid")];
+            int32 var_2192_groups_0 = const()[name = string("op_2192_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2192_strides_0 = const()[name = string("op_2192_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2192_pad_0 = const()[name = string("op_2192_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2192_dilations_0 = const()[name = string("op_2192_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_4_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(400493248))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403114752))))[name = string("squeeze_4_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2177_cast_fp16 = transpose(perm = var_2176, x = attn_output_27_cast_fp16)[name = string("transpose_52")];
+            tensor<fp16, [1, 2560, 3]> var_2192_cast_fp16 = conv(dilations = var_2192_dilations_0, groups = var_2192_groups_0, pad = var_2192_pad_0, pad_type = var_2192_pad_type_0, strides = var_2192_strides_0, weight = squeeze_4_cast_fp16_to_fp32_to_fp16_palettized, x = var_2177_cast_fp16)[name = string("op_2192_cast_fp16")];
+            tensor<int32, [3]> var_2196 = const()[name = string("op_2196"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2202 = const()[name = string("op_2202"), val = int32(-1)];
+            fp16 const_35_promoted_to_fp16 = const()[name = string("const_35_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_71_cast_fp16 = transpose(perm = var_2196, x = var_2192_cast_fp16)[name = string("transpose_51")];
+            tensor<fp16, [1, 3, 2560]> var_2204_cast_fp16 = mul(x = x_71_cast_fp16, y = const_35_promoted_to_fp16)[name = string("op_2204_cast_fp16")];
+            bool input_105_interleave_0 = const()[name = string("input_105_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_105_cast_fp16 = concat(axis = var_2202, interleave = input_105_interleave_0, values = (x_71_cast_fp16, var_2204_cast_fp16))[name = string("input_105_cast_fp16")];
+            tensor<int32, [1]> normed_105_axes_0 = const()[name = string("normed_105_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2199_to_fp16 = const()[name = string("op_2199_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_105_cast_fp16 = layer_norm(axes = normed_105_axes_0, epsilon = var_2199_to_fp16, x = input_105_cast_fp16)[name = string("normed_105_cast_fp16")];
+            tensor<int32, [2]> var_2209_split_sizes_0 = const()[name = string("op_2209_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2209_axis_0 = const()[name = string("op_2209_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2209_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2209_cast_fp16_1 = split(axis = var_2209_axis_0, split_sizes = var_2209_split_sizes_0, x = normed_105_cast_fp16)[name = string("op_2209_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403117376)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_29_cast_fp16 = mul(x = var_2209_cast_fp16_0, y = layers_4_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_73_cast_fp16 = add(x = x_63_cast_fp16, y = attn_output_29_cast_fp16)[name = string("x_73_cast_fp16")];
+            int32 var_2218 = const()[name = string("op_2218"), val = int32(-1)];
+            fp16 const_36_promoted_to_fp16 = const()[name = string("const_36_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2220_cast_fp16 = mul(x = x_73_cast_fp16, y = const_36_promoted_to_fp16)[name = string("op_2220_cast_fp16")];
+            bool input_107_interleave_0 = const()[name = string("input_107_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_107_cast_fp16 = concat(axis = var_2218, interleave = input_107_interleave_0, values = (x_73_cast_fp16, var_2220_cast_fp16))[name = string("input_107_cast_fp16")];
+            tensor<int32, [1]> normed_109_axes_0 = const()[name = string("normed_109_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2215_to_fp16 = const()[name = string("op_2215_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_109_cast_fp16 = layer_norm(axes = normed_109_axes_0, epsilon = var_2215_to_fp16, x = input_107_cast_fp16)[name = string("normed_109_cast_fp16")];
+            tensor<int32, [2]> var_2225_split_sizes_0 = const()[name = string("op_2225_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2225_axis_0 = const()[name = string("op_2225_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2225_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2225_cast_fp16_1 = split(axis = var_2225_axis_0, split_sizes = var_2225_split_sizes_0, x = normed_109_cast_fp16)[name = string("op_2225_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403122560)))];
+            tensor<fp16, [1, 3, 2560]> h_27_cast_fp16 = mul(x = var_2225_cast_fp16_0, y = layers_4_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_27_cast_fp16")];
+            tensor<int32, [3]> var_2236 = const()[name = string("op_2236"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_109_axes_0 = const()[name = string("input_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2237 = transpose(perm = var_2236, x = h_27_cast_fp16)[name = string("transpose_50")];
+            tensor<fp16, [1, 2560, 1, 3]> input_109 = expand_dims(axes = input_109_axes_0, x = var_2237)[name = string("input_109")];
+            string gate_17_pad_type_0 = const()[name = string("gate_17_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_17_strides_0 = const()[name = string("gate_17_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_17_pad_0 = const()[name = string("gate_17_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_17_dilations_0 = const()[name = string("gate_17_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_17_groups_0 = const()[name = string("gate_17_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_17 = conv(dilations = gate_17_dilations_0, groups = gate_17_groups_0, pad = gate_17_pad_0, pad_type = gate_17_pad_type_0, strides = gate_17_strides_0, weight = layers_4_mlp_gate_proj_weight_palettized, x = input_109)[name = string("gate_17")];
+            string up_9_pad_type_0 = const()[name = string("up_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_9_strides_0 = const()[name = string("up_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_9_pad_0 = const()[name = string("up_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_9_dilations_0 = const()[name = string("up_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_9_groups_0 = const()[name = string("up_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_9 = conv(dilations = up_9_dilations_0, groups = up_9_groups_0, pad = up_9_pad_0, pad_type = up_9_pad_type_0, strides = up_9_strides_0, weight = layers_4_mlp_up_proj_weight_palettized, x = input_109)[name = string("up_9")];
+            string gate_19_mode_0 = const()[name = string("gate_19_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_19 = gelu(mode = gate_19_mode_0, x = gate_17)[name = string("gate_19")];
+            tensor<fp16, [1, 10240, 1, 3]> input_111 = mul(x = gate_19, y = up_9)[name = string("input_111")];
+            string mlp_out_9_pad_type_0 = const()[name = string("mlp_out_9_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_9_strides_0 = const()[name = string("mlp_out_9_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_9_pad_0 = const()[name = string("mlp_out_9_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_9_dilations_0 = const()[name = string("mlp_out_9_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_9_groups_0 = const()[name = string("mlp_out_9_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_9 = conv(dilations = mlp_out_9_dilations_0, groups = mlp_out_9_groups_0, pad = mlp_out_9_pad_0, pad_type = mlp_out_9_pad_type_0, strides = mlp_out_9_strides_0, weight = layers_4_mlp_down_proj_weight_palettized, x = input_111)[name = string("mlp_out_9")];
+            tensor<int32, [1]> var_2277_axes_0 = const()[name = string("op_2277_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2277 = squeeze(axes = var_2277_axes_0, x = mlp_out_9)[name = string("op_2277")];
+            tensor<int32, [3]> var_2281 = const()[name = string("op_2281"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2287 = const()[name = string("op_2287"), val = int32(-1)];
+            fp16 const_37_promoted = const()[name = string("const_37_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_75 = transpose(perm = var_2281, x = var_2277)[name = string("transpose_49")];
+            tensor<fp16, [1, 3, 2560]> var_2289 = mul(x = x_75, y = const_37_promoted)[name = string("op_2289")];
+            bool input_113_interleave_0 = const()[name = string("input_113_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_113 = concat(axis = var_2287, interleave = input_113_interleave_0, values = (x_75, var_2289))[name = string("input_113")];
+            tensor<int32, [1]> normed_113_axes_0 = const()[name = string("normed_113_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2284_to_fp16 = const()[name = string("op_2284_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_113_cast_fp16 = layer_norm(axes = normed_113_axes_0, epsilon = var_2284_to_fp16, x = input_113)[name = string("normed_113_cast_fp16")];
+            tensor<int32, [2]> var_2294_split_sizes_0 = const()[name = string("op_2294_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2294_axis_0 = const()[name = string("op_2294_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2294_0, tensor<fp16, [1, 3, 2560]> var_2294_1 = split(axis = var_2294_axis_0, split_sizes = var_2294_split_sizes_0, x = normed_113_cast_fp16)[name = string("op_2294")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_43 = mul(x = var_2294_0, y = layers_4_post_feedforward_layernorm_weight)[name = string("hidden_states_43")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_45_cast_fp16 = add(x = x_73_cast_fp16, y = hidden_states_43)[name = string("hidden_states_45_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_9_begin_0 = const()[name = string("per_layer_slice_9_begin_0"), val = tensor<int32, [3]>([0, 0, 9472])];
+            tensor<int32, [3]> per_layer_slice_9_end_0 = const()[name = string("per_layer_slice_9_end_0"), val = tensor<int32, [3]>([1, 3, 9728])];
+            tensor<bool, [3]> per_layer_slice_9_end_mask_0 = const()[name = string("per_layer_slice_9_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_9_cast_fp16 = slice_by_index(begin = per_layer_slice_9_begin_0, end = per_layer_slice_9_end_0, end_mask = per_layer_slice_9_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_9_cast_fp16")];
+            tensor<int32, [3]> var_2322 = const()[name = string("op_2322"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_115_axes_0 = const()[name = string("input_115_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2323 = transpose(perm = var_2322, x = hidden_states_45_cast_fp16)[name = string("transpose_48")];
+            tensor<fp16, [1, 2560, 1, 3]> input_115 = expand_dims(axes = input_115_axes_0, x = var_2323)[name = string("input_115")];
+            string gated_25_pad_type_0 = const()[name = string("gated_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_25_strides_0 = const()[name = string("gated_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_25_pad_0 = const()[name = string("gated_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_25_dilations_0 = const()[name = string("gated_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_25_groups_0 = const()[name = string("gated_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_25 = conv(dilations = gated_25_dilations_0, groups = gated_25_groups_0, pad = gated_25_pad_0, pad_type = gated_25_pad_type_0, strides = gated_25_strides_0, weight = layers_4_per_layer_input_gate_weight_palettized, x = input_115)[name = string("gated_25")];
+            string gated_27_mode_0 = const()[name = string("gated_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_27 = gelu(mode = gated_27_mode_0, x = gated_25)[name = string("gated_27")];
+            tensor<int32, [3]> var_2342 = const()[name = string("op_2342"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_9_axes_0 = const()[name = string("per_layer_slice_conv_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_2343_cast_fp16 = transpose(perm = var_2342, x = per_layer_slice_9_cast_fp16)[name = string("transpose_47")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_9_cast_fp16 = expand_dims(axes = per_layer_slice_conv_9_axes_0, x = var_2343_cast_fp16)[name = string("per_layer_slice_conv_9_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_117_cast_fp16 = mul(x = gated_27, y = per_layer_slice_conv_9_cast_fp16)[name = string("input_117_cast_fp16")];
+            string gated_29_pad_type_0 = const()[name = string("gated_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_29_strides_0 = const()[name = string("gated_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_29_pad_0 = const()[name = string("gated_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_29_dilations_0 = const()[name = string("gated_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_29_groups_0 = const()[name = string("gated_29_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_4_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403127744))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403455488))))[name = string("layers_4_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_29_cast_fp16 = conv(dilations = gated_29_dilations_0, groups = gated_29_groups_0, pad = gated_29_pad_0, pad_type = gated_29_pad_type_0, strides = gated_29_strides_0, weight = layers_4_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_117_cast_fp16)[name = string("gated_29_cast_fp16")];
+            tensor<int32, [1]> var_2359_axes_0 = const()[name = string("op_2359_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2359_cast_fp16 = squeeze(axes = var_2359_axes_0, x = gated_29_cast_fp16)[name = string("op_2359_cast_fp16")];
+            tensor<int32, [3]> var_2363 = const()[name = string("op_2363"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2369 = const()[name = string("op_2369"), val = int32(-1)];
+            fp16 const_38_promoted_to_fp16 = const()[name = string("const_38_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_77_cast_fp16 = transpose(perm = var_2363, x = var_2359_cast_fp16)[name = string("transpose_46")];
+            tensor<fp16, [1, 3, 2560]> var_2371_cast_fp16 = mul(x = x_77_cast_fp16, y = const_38_promoted_to_fp16)[name = string("op_2371_cast_fp16")];
+            bool input_119_interleave_0 = const()[name = string("input_119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_119_cast_fp16 = concat(axis = var_2369, interleave = input_119_interleave_0, values = (x_77_cast_fp16, var_2371_cast_fp16))[name = string("input_119_cast_fp16")];
+            tensor<int32, [1]> normed_117_axes_0 = const()[name = string("normed_117_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2366_to_fp16 = const()[name = string("op_2366_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_117_cast_fp16 = layer_norm(axes = normed_117_axes_0, epsilon = var_2366_to_fp16, x = input_119_cast_fp16)[name = string("normed_117_cast_fp16")];
+            tensor<int32, [2]> var_2376_split_sizes_0 = const()[name = string("op_2376_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2376_axis_0 = const()[name = string("op_2376_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2376_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2376_cast_fp16_1 = split(axis = var_2376_axis_0, split_sizes = var_2376_split_sizes_0, x = normed_117_cast_fp16)[name = string("op_2376_cast_fp16")];
+            tensor<fp16, [2560]> layers_4_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_4_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403458112)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_49_cast_fp16 = mul(x = var_2376_cast_fp16_0, y = layers_4_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_49_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_51_cast_fp16 = add(x = hidden_states_45_cast_fp16, y = hidden_states_49_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            tensor<fp16, [1]> const_39_promoted_to_fp16 = const()[name = string("const_39_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c6p-1])];
+            tensor<fp16, [1, 3, 2560]> x_79_cast_fp16 = mul(x = hidden_states_51_cast_fp16, y = const_39_promoted_to_fp16)[name = string("x_79_cast_fp16")];
+            int32 var_2391 = const()[name = string("op_2391"), val = int32(-1)];
+            fp16 const_40_promoted_to_fp16 = const()[name = string("const_40_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2393_cast_fp16 = mul(x = x_79_cast_fp16, y = const_40_promoted_to_fp16)[name = string("op_2393_cast_fp16")];
+            bool input_121_interleave_0 = const()[name = string("input_121_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_121_cast_fp16 = concat(axis = var_2391, interleave = input_121_interleave_0, values = (x_79_cast_fp16, var_2393_cast_fp16))[name = string("input_121_cast_fp16")];
+            tensor<int32, [1]> normed_121_axes_0 = const()[name = string("normed_121_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2388_to_fp16 = const()[name = string("op_2388_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_121_cast_fp16 = layer_norm(axes = normed_121_axes_0, epsilon = var_2388_to_fp16, x = input_121_cast_fp16)[name = string("normed_121_cast_fp16")];
+            tensor<int32, [2]> var_2398_split_sizes_0 = const()[name = string("op_2398_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2398_axis_0 = const()[name = string("op_2398_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2398_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2398_cast_fp16_1 = split(axis = var_2398_axis_0, split_sizes = var_2398_split_sizes_0, x = normed_121_cast_fp16)[name = string("op_2398_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403463296)))];
+            tensor<fp16, [1, 3, 2560]> h_31_cast_fp16 = mul(x = var_2398_cast_fp16_0, y = layers_5_input_layernorm_weight_promoted_to_fp16)[name = string("h_31_cast_fp16")];
+            tensor<int32, [3]> var_2404 = const()[name = string("op_2404"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2407_axes_0 = const()[name = string("op_2407_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2405_cast_fp16 = transpose(perm = var_2404, x = h_31_cast_fp16)[name = string("transpose_45")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2407_cast_fp16 = expand_dims(axes = var_2407_axes_0, x = var_2405_cast_fp16)[name = string("op_2407_cast_fp16")];
+            string q_51_pad_type_0 = const()[name = string("q_51_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_51_strides_0 = const()[name = string("q_51_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_51_pad_0 = const()[name = string("q_51_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_51_dilations_0 = const()[name = string("q_51_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_51_groups_0 = const()[name = string("q_51_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_51 = conv(dilations = q_51_dilations_0, groups = q_51_groups_0, pad = q_51_pad_0, pad_type = q_51_pad_type_0, strides = q_51_strides_0, weight = layers_5_self_attn_q_proj_weight_palettized, x = var_2407_cast_fp16)[name = string("q_51")];
+            tensor<int32, [4]> var_2428 = const()[name = string("op_2428"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2429 = reshape(shape = var_2428, x = q_51)[name = string("op_2429")];
+            tensor<int32, [4]> transpose_46_perm_0 = const()[name = string("transpose_46_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2452 = const()[name = string("op_2452"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_46 = transpose(perm = transpose_46_perm_0, x = var_2429)[name = string("transpose_44")];
+            tensor<fp16, [3, 8, 256]> x_81 = reshape(shape = var_2452, x = transpose_46)[name = string("x_81")];
+            int32 var_2458 = const()[name = string("op_2458"), val = int32(-1)];
+            fp16 const_41_promoted = const()[name = string("const_41_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2460 = mul(x = x_81, y = const_41_promoted)[name = string("op_2460")];
+            bool input_125_interleave_0 = const()[name = string("input_125_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_125 = concat(axis = var_2458, interleave = input_125_interleave_0, values = (x_81, var_2460))[name = string("input_125")];
+            tensor<int32, [1]> normed_125_axes_0 = const()[name = string("normed_125_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2455_to_fp16 = const()[name = string("op_2455_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_125_cast_fp16 = layer_norm(axes = normed_125_axes_0, epsilon = var_2455_to_fp16, x = input_125)[name = string("normed_125_cast_fp16")];
+            tensor<int32, [2]> var_2465_split_sizes_0 = const()[name = string("op_2465_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2465_axis_0 = const()[name = string("op_2465_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2465_0, tensor<fp16, [3, 8, 256]> var_2465_1 = split(axis = var_2465_axis_0, split_sizes = var_2465_split_sizes_0, x = normed_125_cast_fp16)[name = string("op_2465")];
+            tensor<fp16, [3, 8, 256]> q_55 = mul(x = var_2465_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_55")];
+            tensor<int32, [4]> var_2472 = const()[name = string("op_2472"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2473 = reshape(shape = var_2472, x = q_55)[name = string("op_2473")];
+            tensor<int32, [4]> var_2478 = const()[name = string("op_2478"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_57 = transpose(perm = var_2478, x = var_2473)[name = string("transpose_43")];
+            tensor<fp16, [1, 8, 3, 256]> var_2480_cast_fp16 = mul(x = q_57, y = cos_s)[name = string("op_2480_cast_fp16")];
+            tensor<int32, [2]> var_2481_split_sizes_0 = const()[name = string("op_2481_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2481_axis_0 = const()[name = string("op_2481_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2481_0, tensor<fp16, [1, 8, 3, 128]> var_2481_1 = split(axis = var_2481_axis_0, split_sizes = var_2481_split_sizes_0, x = q_57)[name = string("op_2481")];
+            fp16 const_42_promoted = const()[name = string("const_42_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2483 = mul(x = var_2481_1, y = const_42_promoted)[name = string("op_2483")];
+            int32 var_2485 = const()[name = string("op_2485"), val = int32(-1)];
+            bool var_2486_interleave_0 = const()[name = string("op_2486_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2486 = concat(axis = var_2485, interleave = var_2486_interleave_0, values = (var_2483, var_2481_0))[name = string("op_2486")];
+            tensor<fp16, [1, 8, 3, 256]> var_2487_cast_fp16 = mul(x = var_2486, y = sin_s)[name = string("op_2487_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_59_cast_fp16 = add(x = var_2480_cast_fp16, y = var_2487_cast_fp16)[name = string("q_59_cast_fp16")];
+            bool attn_weights_21_transpose_x_0 = const()[name = string("attn_weights_21_transpose_x_0"), val = bool(false)];
+            bool attn_weights_21_transpose_y_0 = const()[name = string("attn_weights_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_21_cast_fp16 = matmul(transpose_x = attn_weights_21_transpose_x_0, transpose_y = attn_weights_21_transpose_y_0, x = q_59_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_21_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_83_cast_fp16 = add(x = attn_weights_21_cast_fp16, y = causal_mask_sliding)[name = string("x_83_cast_fp16")];
+            tensor<int32, [1]> reduce_max_5_axes_0 = const()[name = string("reduce_max_5_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_5_keep_dims_0 = const()[name = string("reduce_max_5_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_5 = reduce_max(axes = reduce_max_5_axes_0, keep_dims = reduce_max_5_keep_dims_0, x = x_83_cast_fp16)[name = string("reduce_max_5")];
+            tensor<fp16, [1, 8, 3, 512]> var_2519 = sub(x = x_83_cast_fp16, y = reduce_max_5)[name = string("op_2519")];
+            tensor<fp16, [1, 8, 3, 512]> var_2525 = exp(x = var_2519)[name = string("op_2525")];
+            tensor<int32, [1]> var_2535_axes_0 = const()[name = string("op_2535_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2535_keep_dims_0 = const()[name = string("op_2535_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2535 = reduce_sum(axes = var_2535_axes_0, keep_dims = var_2535_keep_dims_0, x = var_2525)[name = string("op_2535")];
+            tensor<fp16, [1, 8, 3, 512]> var_2541_cast_fp16 = real_div(x = var_2525, y = var_2535)[name = string("op_2541_cast_fp16")];
+            bool attn_output_31_transpose_x_0 = const()[name = string("attn_output_31_transpose_x_0"), val = bool(false)];
+            bool attn_output_31_transpose_y_0 = const()[name = string("attn_output_31_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_31_cast_fp16 = matmul(transpose_x = attn_output_31_transpose_x_0, transpose_y = attn_output_31_transpose_y_0, x = var_2541_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_31_cast_fp16")];
+            tensor<int32, [4]> var_2552 = const()[name = string("op_2552"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2559 = const()[name = string("op_2559"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2553_cast_fp16 = transpose(perm = var_2552, x = attn_output_31_cast_fp16)[name = string("transpose_42")];
+            tensor<fp16, [1, 3, 2048]> attn_output_33_cast_fp16 = reshape(shape = var_2559, x = var_2553_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [3]> var_2564 = const()[name = string("op_2564"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2580_pad_type_0 = const()[name = string("op_2580_pad_type_0"), val = string("valid")];
+            int32 var_2580_groups_0 = const()[name = string("op_2580_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2580_strides_0 = const()[name = string("op_2580_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2580_pad_0 = const()[name = string("op_2580_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2580_dilations_0 = const()[name = string("op_2580_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_5_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(403468480))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406089984))))[name = string("squeeze_5_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2565_cast_fp16 = transpose(perm = var_2564, x = attn_output_33_cast_fp16)[name = string("transpose_41")];
+            tensor<fp16, [1, 2560, 3]> var_2580_cast_fp16 = conv(dilations = var_2580_dilations_0, groups = var_2580_groups_0, pad = var_2580_pad_0, pad_type = var_2580_pad_type_0, strides = var_2580_strides_0, weight = squeeze_5_cast_fp16_to_fp32_to_fp16_palettized, x = var_2565_cast_fp16)[name = string("op_2580_cast_fp16")];
+            tensor<int32, [3]> var_2584 = const()[name = string("op_2584"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2590 = const()[name = string("op_2590"), val = int32(-1)];
+            fp16 const_43_promoted_to_fp16 = const()[name = string("const_43_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_87_cast_fp16 = transpose(perm = var_2584, x = var_2580_cast_fp16)[name = string("transpose_40")];
+            tensor<fp16, [1, 3, 2560]> var_2592_cast_fp16 = mul(x = x_87_cast_fp16, y = const_43_promoted_to_fp16)[name = string("op_2592_cast_fp16")];
+            bool input_129_interleave_0 = const()[name = string("input_129_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_129_cast_fp16 = concat(axis = var_2590, interleave = input_129_interleave_0, values = (x_87_cast_fp16, var_2592_cast_fp16))[name = string("input_129_cast_fp16")];
+            tensor<int32, [1]> normed_129_axes_0 = const()[name = string("normed_129_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2587_to_fp16 = const()[name = string("op_2587_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_129_cast_fp16 = layer_norm(axes = normed_129_axes_0, epsilon = var_2587_to_fp16, x = input_129_cast_fp16)[name = string("normed_129_cast_fp16")];
+            tensor<int32, [2]> var_2597_split_sizes_0 = const()[name = string("op_2597_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2597_axis_0 = const()[name = string("op_2597_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2597_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2597_cast_fp16_1 = split(axis = var_2597_axis_0, split_sizes = var_2597_split_sizes_0, x = normed_129_cast_fp16)[name = string("op_2597_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406092608)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_35_cast_fp16 = mul(x = var_2597_cast_fp16_0, y = layers_5_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_35_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_89_cast_fp16 = add(x = x_79_cast_fp16, y = attn_output_35_cast_fp16)[name = string("x_89_cast_fp16")];
+            int32 var_2606 = const()[name = string("op_2606"), val = int32(-1)];
+            fp16 const_44_promoted_to_fp16 = const()[name = string("const_44_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2608_cast_fp16 = mul(x = x_89_cast_fp16, y = const_44_promoted_to_fp16)[name = string("op_2608_cast_fp16")];
+            bool input_131_interleave_0 = const()[name = string("input_131_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_131_cast_fp16 = concat(axis = var_2606, interleave = input_131_interleave_0, values = (x_89_cast_fp16, var_2608_cast_fp16))[name = string("input_131_cast_fp16")];
+            tensor<int32, [1]> normed_133_axes_0 = const()[name = string("normed_133_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2603_to_fp16 = const()[name = string("op_2603_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_133_cast_fp16 = layer_norm(axes = normed_133_axes_0, epsilon = var_2603_to_fp16, x = input_131_cast_fp16)[name = string("normed_133_cast_fp16")];
+            tensor<int32, [2]> var_2613_split_sizes_0 = const()[name = string("op_2613_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2613_axis_0 = const()[name = string("op_2613_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2613_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2613_cast_fp16_1 = split(axis = var_2613_axis_0, split_sizes = var_2613_split_sizes_0, x = normed_133_cast_fp16)[name = string("op_2613_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406097792)))];
+            tensor<fp16, [1, 3, 2560]> h_33_cast_fp16 = mul(x = var_2613_cast_fp16_0, y = layers_5_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_33_cast_fp16")];
+            tensor<int32, [3]> var_2624 = const()[name = string("op_2624"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_133_axes_0 = const()[name = string("input_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2625 = transpose(perm = var_2624, x = h_33_cast_fp16)[name = string("transpose_39")];
+            tensor<fp16, [1, 2560, 1, 3]> input_133 = expand_dims(axes = input_133_axes_0, x = var_2625)[name = string("input_133")];
+            string gate_21_pad_type_0 = const()[name = string("gate_21_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_21_strides_0 = const()[name = string("gate_21_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_21_pad_0 = const()[name = string("gate_21_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_21_dilations_0 = const()[name = string("gate_21_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_21_groups_0 = const()[name = string("gate_21_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_21 = conv(dilations = gate_21_dilations_0, groups = gate_21_groups_0, pad = gate_21_pad_0, pad_type = gate_21_pad_type_0, strides = gate_21_strides_0, weight = layers_5_mlp_gate_proj_weight_palettized, x = input_133)[name = string("gate_21")];
+            string up_11_pad_type_0 = const()[name = string("up_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_11_strides_0 = const()[name = string("up_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_11_pad_0 = const()[name = string("up_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_11_dilations_0 = const()[name = string("up_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_11_groups_0 = const()[name = string("up_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_11 = conv(dilations = up_11_dilations_0, groups = up_11_groups_0, pad = up_11_pad_0, pad_type = up_11_pad_type_0, strides = up_11_strides_0, weight = layers_5_mlp_up_proj_weight_palettized, x = input_133)[name = string("up_11")];
+            string gate_23_mode_0 = const()[name = string("gate_23_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_23 = gelu(mode = gate_23_mode_0, x = gate_21)[name = string("gate_23")];
+            tensor<fp16, [1, 10240, 1, 3]> input_135 = mul(x = gate_23, y = up_11)[name = string("input_135")];
+            string mlp_out_11_pad_type_0 = const()[name = string("mlp_out_11_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_11_strides_0 = const()[name = string("mlp_out_11_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_11_pad_0 = const()[name = string("mlp_out_11_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_11_dilations_0 = const()[name = string("mlp_out_11_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_11_groups_0 = const()[name = string("mlp_out_11_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_11 = conv(dilations = mlp_out_11_dilations_0, groups = mlp_out_11_groups_0, pad = mlp_out_11_pad_0, pad_type = mlp_out_11_pad_type_0, strides = mlp_out_11_strides_0, weight = layers_5_mlp_down_proj_weight_palettized, x = input_135)[name = string("mlp_out_11")];
+            tensor<int32, [1]> var_2665_axes_0 = const()[name = string("op_2665_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2665 = squeeze(axes = var_2665_axes_0, x = mlp_out_11)[name = string("op_2665")];
+            tensor<int32, [3]> var_2669 = const()[name = string("op_2669"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2675 = const()[name = string("op_2675"), val = int32(-1)];
+            fp16 const_45_promoted = const()[name = string("const_45_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_91 = transpose(perm = var_2669, x = var_2665)[name = string("transpose_38")];
+            tensor<fp16, [1, 3, 2560]> var_2677 = mul(x = x_91, y = const_45_promoted)[name = string("op_2677")];
+            bool input_137_interleave_0 = const()[name = string("input_137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_137 = concat(axis = var_2675, interleave = input_137_interleave_0, values = (x_91, var_2677))[name = string("input_137")];
+            tensor<int32, [1]> normed_137_axes_0 = const()[name = string("normed_137_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2672_to_fp16 = const()[name = string("op_2672_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_137_cast_fp16 = layer_norm(axes = normed_137_axes_0, epsilon = var_2672_to_fp16, x = input_137)[name = string("normed_137_cast_fp16")];
+            tensor<int32, [2]> var_2682_split_sizes_0 = const()[name = string("op_2682_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2682_axis_0 = const()[name = string("op_2682_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2682_0, tensor<fp16, [1, 3, 2560]> var_2682_1 = split(axis = var_2682_axis_0, split_sizes = var_2682_split_sizes_0, x = normed_137_cast_fp16)[name = string("op_2682")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_53 = mul(x = var_2682_0, y = layers_5_post_feedforward_layernorm_weight)[name = string("hidden_states_53")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_55_cast_fp16 = add(x = x_89_cast_fp16, y = hidden_states_53)[name = string("hidden_states_55_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_11_begin_0 = const()[name = string("per_layer_slice_11_begin_0"), val = tensor<int32, [3]>([0, 0, 9728])];
+            tensor<int32, [3]> per_layer_slice_11_end_0 = const()[name = string("per_layer_slice_11_end_0"), val = tensor<int32, [3]>([1, 3, 9984])];
+            tensor<bool, [3]> per_layer_slice_11_end_mask_0 = const()[name = string("per_layer_slice_11_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_11_cast_fp16 = slice_by_index(begin = per_layer_slice_11_begin_0, end = per_layer_slice_11_end_0, end_mask = per_layer_slice_11_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_11_cast_fp16")];
+            tensor<int32, [3]> var_2710 = const()[name = string("op_2710"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_139_axes_0 = const()[name = string("input_139_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2711 = transpose(perm = var_2710, x = hidden_states_55_cast_fp16)[name = string("transpose_37")];
+            tensor<fp16, [1, 2560, 1, 3]> input_139 = expand_dims(axes = input_139_axes_0, x = var_2711)[name = string("input_139")];
+            string gated_31_pad_type_0 = const()[name = string("gated_31_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_31_strides_0 = const()[name = string("gated_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_31_pad_0 = const()[name = string("gated_31_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_31_dilations_0 = const()[name = string("gated_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_31_groups_0 = const()[name = string("gated_31_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_31 = conv(dilations = gated_31_dilations_0, groups = gated_31_groups_0, pad = gated_31_pad_0, pad_type = gated_31_pad_type_0, strides = gated_31_strides_0, weight = layers_5_per_layer_input_gate_weight_palettized, x = input_139)[name = string("gated_31")];
+            string gated_33_mode_0 = const()[name = string("gated_33_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_33 = gelu(mode = gated_33_mode_0, x = gated_31)[name = string("gated_33")];
+            tensor<int32, [3]> var_2730 = const()[name = string("op_2730"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_11_axes_0 = const()[name = string("per_layer_slice_conv_11_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_2731_cast_fp16 = transpose(perm = var_2730, x = per_layer_slice_11_cast_fp16)[name = string("transpose_36")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_11_cast_fp16 = expand_dims(axes = per_layer_slice_conv_11_axes_0, x = var_2731_cast_fp16)[name = string("per_layer_slice_conv_11_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_141_cast_fp16 = mul(x = gated_33, y = per_layer_slice_conv_11_cast_fp16)[name = string("input_141_cast_fp16")];
+            string gated_35_pad_type_0 = const()[name = string("gated_35_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_35_strides_0 = const()[name = string("gated_35_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_35_pad_0 = const()[name = string("gated_35_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_35_dilations_0 = const()[name = string("gated_35_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_35_groups_0 = const()[name = string("gated_35_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_5_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406102976))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406430720))))[name = string("layers_5_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_35_cast_fp16 = conv(dilations = gated_35_dilations_0, groups = gated_35_groups_0, pad = gated_35_pad_0, pad_type = gated_35_pad_type_0, strides = gated_35_strides_0, weight = layers_5_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_141_cast_fp16)[name = string("gated_35_cast_fp16")];
+            tensor<int32, [1]> var_2747_axes_0 = const()[name = string("op_2747_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2747_cast_fp16 = squeeze(axes = var_2747_axes_0, x = gated_35_cast_fp16)[name = string("op_2747_cast_fp16")];
+            tensor<int32, [3]> var_2751 = const()[name = string("op_2751"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2757 = const()[name = string("op_2757"), val = int32(-1)];
+            fp16 const_46_promoted_to_fp16 = const()[name = string("const_46_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_93_cast_fp16 = transpose(perm = var_2751, x = var_2747_cast_fp16)[name = string("transpose_35")];
+            tensor<fp16, [1, 3, 2560]> var_2759_cast_fp16 = mul(x = x_93_cast_fp16, y = const_46_promoted_to_fp16)[name = string("op_2759_cast_fp16")];
+            bool input_143_interleave_0 = const()[name = string("input_143_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_143_cast_fp16 = concat(axis = var_2757, interleave = input_143_interleave_0, values = (x_93_cast_fp16, var_2759_cast_fp16))[name = string("input_143_cast_fp16")];
+            tensor<int32, [1]> normed_141_axes_0 = const()[name = string("normed_141_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2754_to_fp16 = const()[name = string("op_2754_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_141_cast_fp16 = layer_norm(axes = normed_141_axes_0, epsilon = var_2754_to_fp16, x = input_143_cast_fp16)[name = string("normed_141_cast_fp16")];
+            tensor<int32, [2]> var_2764_split_sizes_0 = const()[name = string("op_2764_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2764_axis_0 = const()[name = string("op_2764_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2764_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2764_cast_fp16_1 = split(axis = var_2764_axis_0, split_sizes = var_2764_split_sizes_0, x = normed_141_cast_fp16)[name = string("op_2764_cast_fp16")];
+            tensor<fp16, [2560]> layers_5_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_5_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406433344)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_59_cast_fp16 = mul(x = var_2764_cast_fp16_0, y = layers_5_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_59_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_61_cast_fp16 = add(x = hidden_states_55_cast_fp16, y = hidden_states_59_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            tensor<fp16, [1]> const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c4p-1])];
+            tensor<fp16, [1, 3, 2560]> x_95_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = const_47_promoted_to_fp16)[name = string("x_95_cast_fp16")];
+            int32 var_2779 = const()[name = string("op_2779"), val = int32(-1)];
+            fp16 const_48_promoted_to_fp16 = const()[name = string("const_48_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2781_cast_fp16 = mul(x = x_95_cast_fp16, y = const_48_promoted_to_fp16)[name = string("op_2781_cast_fp16")];
+            bool input_145_interleave_0 = const()[name = string("input_145_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_145_cast_fp16 = concat(axis = var_2779, interleave = input_145_interleave_0, values = (x_95_cast_fp16, var_2781_cast_fp16))[name = string("input_145_cast_fp16")];
+            tensor<int32, [1]> normed_145_axes_0 = const()[name = string("normed_145_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2776_to_fp16 = const()[name = string("op_2776_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_145_cast_fp16 = layer_norm(axes = normed_145_axes_0, epsilon = var_2776_to_fp16, x = input_145_cast_fp16)[name = string("normed_145_cast_fp16")];
+            tensor<int32, [2]> var_2786_split_sizes_0 = const()[name = string("op_2786_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2786_axis_0 = const()[name = string("op_2786_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2786_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2786_cast_fp16_1 = split(axis = var_2786_axis_0, split_sizes = var_2786_split_sizes_0, x = normed_145_cast_fp16)[name = string("op_2786_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406438528)))];
+            tensor<fp16, [1, 3, 2560]> h_37_cast_fp16 = mul(x = var_2786_cast_fp16_0, y = layers_6_input_layernorm_weight_promoted_to_fp16)[name = string("h_37_cast_fp16")];
+            tensor<int32, [3]> var_2792 = const()[name = string("op_2792"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_2795_axes_0 = const()[name = string("op_2795_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_2793_cast_fp16 = transpose(perm = var_2792, x = h_37_cast_fp16)[name = string("transpose_34")];
+            tensor<fp16, [1, 2560, 1, 3]> var_2795_cast_fp16 = expand_dims(axes = var_2795_axes_0, x = var_2793_cast_fp16)[name = string("op_2795_cast_fp16")];
+            string q_61_pad_type_0 = const()[name = string("q_61_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_61_strides_0 = const()[name = string("q_61_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_61_pad_0 = const()[name = string("q_61_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_61_dilations_0 = const()[name = string("q_61_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_61_groups_0 = const()[name = string("q_61_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_61 = conv(dilations = q_61_dilations_0, groups = q_61_groups_0, pad = q_61_pad_0, pad_type = q_61_pad_type_0, strides = q_61_strides_0, weight = layers_6_self_attn_q_proj_weight_palettized, x = var_2795_cast_fp16)[name = string("q_61")];
+            tensor<int32, [4]> var_2816 = const()[name = string("op_2816"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_2817 = reshape(shape = var_2816, x = q_61)[name = string("op_2817")];
+            tensor<int32, [4]> transpose_48_perm_0 = const()[name = string("transpose_48_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_2840 = const()[name = string("op_2840"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_48 = transpose(perm = transpose_48_perm_0, x = var_2817)[name = string("transpose_33")];
+            tensor<fp16, [3, 8, 256]> x_97 = reshape(shape = var_2840, x = transpose_48)[name = string("x_97")];
+            int32 var_2846 = const()[name = string("op_2846"), val = int32(-1)];
+            fp16 const_49_promoted = const()[name = string("const_49_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_2848 = mul(x = x_97, y = const_49_promoted)[name = string("op_2848")];
+            bool input_149_interleave_0 = const()[name = string("input_149_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_149 = concat(axis = var_2846, interleave = input_149_interleave_0, values = (x_97, var_2848))[name = string("input_149")];
+            tensor<int32, [1]> normed_149_axes_0 = const()[name = string("normed_149_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2843_to_fp16 = const()[name = string("op_2843_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_149_cast_fp16 = layer_norm(axes = normed_149_axes_0, epsilon = var_2843_to_fp16, x = input_149)[name = string("normed_149_cast_fp16")];
+            tensor<int32, [2]> var_2853_split_sizes_0 = const()[name = string("op_2853_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_2853_axis_0 = const()[name = string("op_2853_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_2853_0, tensor<fp16, [3, 8, 256]> var_2853_1 = split(axis = var_2853_axis_0, split_sizes = var_2853_split_sizes_0, x = normed_149_cast_fp16)[name = string("op_2853")];
+            tensor<fp16, [3, 8, 256]> q_65 = mul(x = var_2853_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_65")];
+            tensor<int32, [4]> var_2860 = const()[name = string("op_2860"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_2861 = reshape(shape = var_2860, x = q_65)[name = string("op_2861")];
+            tensor<int32, [4]> var_2866 = const()[name = string("op_2866"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_67 = transpose(perm = var_2866, x = var_2861)[name = string("transpose_32")];
+            tensor<fp16, [1, 8, 3, 256]> var_2868_cast_fp16 = mul(x = q_67, y = cos_s)[name = string("op_2868_cast_fp16")];
+            tensor<int32, [2]> var_2869_split_sizes_0 = const()[name = string("op_2869_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_2869_axis_0 = const()[name = string("op_2869_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_2869_0, tensor<fp16, [1, 8, 3, 128]> var_2869_1 = split(axis = var_2869_axis_0, split_sizes = var_2869_split_sizes_0, x = q_67)[name = string("op_2869")];
+            fp16 const_50_promoted = const()[name = string("const_50_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_2871 = mul(x = var_2869_1, y = const_50_promoted)[name = string("op_2871")];
+            int32 var_2873 = const()[name = string("op_2873"), val = int32(-1)];
+            bool var_2874_interleave_0 = const()[name = string("op_2874_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_2874 = concat(axis = var_2873, interleave = var_2874_interleave_0, values = (var_2871, var_2869_0))[name = string("op_2874")];
+            tensor<fp16, [1, 8, 3, 256]> var_2875_cast_fp16 = mul(x = var_2874, y = sin_s)[name = string("op_2875_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_69_cast_fp16 = add(x = var_2868_cast_fp16, y = var_2875_cast_fp16)[name = string("q_69_cast_fp16")];
+            bool attn_weights_25_transpose_x_0 = const()[name = string("attn_weights_25_transpose_x_0"), val = bool(false)];
+            bool attn_weights_25_transpose_y_0 = const()[name = string("attn_weights_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_25_cast_fp16 = matmul(transpose_x = attn_weights_25_transpose_x_0, transpose_y = attn_weights_25_transpose_y_0, x = q_69_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_25_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_99_cast_fp16 = add(x = attn_weights_25_cast_fp16, y = causal_mask_sliding)[name = string("x_99_cast_fp16")];
+            tensor<int32, [1]> reduce_max_6_axes_0 = const()[name = string("reduce_max_6_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_6_keep_dims_0 = const()[name = string("reduce_max_6_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_6 = reduce_max(axes = reduce_max_6_axes_0, keep_dims = reduce_max_6_keep_dims_0, x = x_99_cast_fp16)[name = string("reduce_max_6")];
+            tensor<fp16, [1, 8, 3, 512]> var_2907 = sub(x = x_99_cast_fp16, y = reduce_max_6)[name = string("op_2907")];
+            tensor<fp16, [1, 8, 3, 512]> var_2913 = exp(x = var_2907)[name = string("op_2913")];
+            tensor<int32, [1]> var_2923_axes_0 = const()[name = string("op_2923_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2923_keep_dims_0 = const()[name = string("op_2923_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_2923 = reduce_sum(axes = var_2923_axes_0, keep_dims = var_2923_keep_dims_0, x = var_2913)[name = string("op_2923")];
+            tensor<fp16, [1, 8, 3, 512]> var_2929_cast_fp16 = real_div(x = var_2913, y = var_2923)[name = string("op_2929_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = var_2929_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_2940 = const()[name = string("op_2940"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2947 = const()[name = string("op_2947"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_2941_cast_fp16 = transpose(perm = var_2940, x = attn_output_37_cast_fp16)[name = string("transpose_31")];
+            tensor<fp16, [1, 3, 2048]> attn_output_39_cast_fp16 = reshape(shape = var_2947, x = var_2941_cast_fp16)[name = string("attn_output_39_cast_fp16")];
+            tensor<int32, [3]> var_2952 = const()[name = string("op_2952"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_2968_pad_type_0 = const()[name = string("op_2968_pad_type_0"), val = string("valid")];
+            int32 var_2968_groups_0 = const()[name = string("op_2968_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_2968_strides_0 = const()[name = string("op_2968_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_2968_pad_0 = const()[name = string("op_2968_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_2968_dilations_0 = const()[name = string("op_2968_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_6_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(406443712))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409065216))))[name = string("squeeze_6_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_2953_cast_fp16 = transpose(perm = var_2952, x = attn_output_39_cast_fp16)[name = string("transpose_30")];
+            tensor<fp16, [1, 2560, 3]> var_2968_cast_fp16 = conv(dilations = var_2968_dilations_0, groups = var_2968_groups_0, pad = var_2968_pad_0, pad_type = var_2968_pad_type_0, strides = var_2968_strides_0, weight = squeeze_6_cast_fp16_to_fp32_to_fp16_palettized, x = var_2953_cast_fp16)[name = string("op_2968_cast_fp16")];
+            tensor<int32, [3]> var_2972 = const()[name = string("op_2972"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_2978 = const()[name = string("op_2978"), val = int32(-1)];
+            fp16 const_51_promoted_to_fp16 = const()[name = string("const_51_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_103_cast_fp16 = transpose(perm = var_2972, x = var_2968_cast_fp16)[name = string("transpose_29")];
+            tensor<fp16, [1, 3, 2560]> var_2980_cast_fp16 = mul(x = x_103_cast_fp16, y = const_51_promoted_to_fp16)[name = string("op_2980_cast_fp16")];
+            bool input_153_interleave_0 = const()[name = string("input_153_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_153_cast_fp16 = concat(axis = var_2978, interleave = input_153_interleave_0, values = (x_103_cast_fp16, var_2980_cast_fp16))[name = string("input_153_cast_fp16")];
+            tensor<int32, [1]> normed_153_axes_0 = const()[name = string("normed_153_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2975_to_fp16 = const()[name = string("op_2975_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_153_cast_fp16 = layer_norm(axes = normed_153_axes_0, epsilon = var_2975_to_fp16, x = input_153_cast_fp16)[name = string("normed_153_cast_fp16")];
+            tensor<int32, [2]> var_2985_split_sizes_0 = const()[name = string("op_2985_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_2985_axis_0 = const()[name = string("op_2985_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_2985_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_2985_cast_fp16_1 = split(axis = var_2985_axis_0, split_sizes = var_2985_split_sizes_0, x = normed_153_cast_fp16)[name = string("op_2985_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409067840)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_41_cast_fp16 = mul(x = var_2985_cast_fp16_0, y = layers_6_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_105_cast_fp16 = add(x = x_95_cast_fp16, y = attn_output_41_cast_fp16)[name = string("x_105_cast_fp16")];
+            int32 var_2994 = const()[name = string("op_2994"), val = int32(-1)];
+            fp16 const_52_promoted_to_fp16 = const()[name = string("const_52_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_2996_cast_fp16 = mul(x = x_105_cast_fp16, y = const_52_promoted_to_fp16)[name = string("op_2996_cast_fp16")];
+            bool input_155_interleave_0 = const()[name = string("input_155_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_155_cast_fp16 = concat(axis = var_2994, interleave = input_155_interleave_0, values = (x_105_cast_fp16, var_2996_cast_fp16))[name = string("input_155_cast_fp16")];
+            tensor<int32, [1]> normed_157_axes_0 = const()[name = string("normed_157_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_2991_to_fp16 = const()[name = string("op_2991_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_157_cast_fp16 = layer_norm(axes = normed_157_axes_0, epsilon = var_2991_to_fp16, x = input_155_cast_fp16)[name = string("normed_157_cast_fp16")];
+            tensor<int32, [2]> var_3001_split_sizes_0 = const()[name = string("op_3001_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3001_axis_0 = const()[name = string("op_3001_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3001_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3001_cast_fp16_1 = split(axis = var_3001_axis_0, split_sizes = var_3001_split_sizes_0, x = normed_157_cast_fp16)[name = string("op_3001_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409073024)))];
+            tensor<fp16, [1, 3, 2560]> h_39_cast_fp16 = mul(x = var_3001_cast_fp16_0, y = layers_6_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_39_cast_fp16")];
+            tensor<int32, [3]> var_3012 = const()[name = string("op_3012"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_157_axes_0 = const()[name = string("input_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3013 = transpose(perm = var_3012, x = h_39_cast_fp16)[name = string("transpose_28")];
+            tensor<fp16, [1, 2560, 1, 3]> input_157 = expand_dims(axes = input_157_axes_0, x = var_3013)[name = string("input_157")];
+            string gate_25_pad_type_0 = const()[name = string("gate_25_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_25_strides_0 = const()[name = string("gate_25_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_25_pad_0 = const()[name = string("gate_25_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_25_dilations_0 = const()[name = string("gate_25_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_25_groups_0 = const()[name = string("gate_25_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_25 = conv(dilations = gate_25_dilations_0, groups = gate_25_groups_0, pad = gate_25_pad_0, pad_type = gate_25_pad_type_0, strides = gate_25_strides_0, weight = layers_6_mlp_gate_proj_weight_palettized, x = input_157)[name = string("gate_25")];
+            string up_13_pad_type_0 = const()[name = string("up_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_13_strides_0 = const()[name = string("up_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_13_pad_0 = const()[name = string("up_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_13_dilations_0 = const()[name = string("up_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_13_groups_0 = const()[name = string("up_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_13 = conv(dilations = up_13_dilations_0, groups = up_13_groups_0, pad = up_13_pad_0, pad_type = up_13_pad_type_0, strides = up_13_strides_0, weight = layers_6_mlp_up_proj_weight_palettized, x = input_157)[name = string("up_13")];
+            string gate_27_mode_0 = const()[name = string("gate_27_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_27 = gelu(mode = gate_27_mode_0, x = gate_25)[name = string("gate_27")];
+            tensor<fp16, [1, 10240, 1, 3]> input_159 = mul(x = gate_27, y = up_13)[name = string("input_159")];
+            string mlp_out_13_pad_type_0 = const()[name = string("mlp_out_13_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_13_strides_0 = const()[name = string("mlp_out_13_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_13_pad_0 = const()[name = string("mlp_out_13_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_13_dilations_0 = const()[name = string("mlp_out_13_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_13_groups_0 = const()[name = string("mlp_out_13_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_13 = conv(dilations = mlp_out_13_dilations_0, groups = mlp_out_13_groups_0, pad = mlp_out_13_pad_0, pad_type = mlp_out_13_pad_type_0, strides = mlp_out_13_strides_0, weight = layers_6_mlp_down_proj_weight_palettized, x = input_159)[name = string("mlp_out_13")];
+            tensor<int32, [1]> var_3053_axes_0 = const()[name = string("op_3053_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3053 = squeeze(axes = var_3053_axes_0, x = mlp_out_13)[name = string("op_3053")];
+            tensor<int32, [3]> var_3057 = const()[name = string("op_3057"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3063 = const()[name = string("op_3063"), val = int32(-1)];
+            fp16 const_53_promoted = const()[name = string("const_53_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_107 = transpose(perm = var_3057, x = var_3053)[name = string("transpose_27")];
+            tensor<fp16, [1, 3, 2560]> var_3065 = mul(x = x_107, y = const_53_promoted)[name = string("op_3065")];
+            bool input_161_interleave_0 = const()[name = string("input_161_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_161 = concat(axis = var_3063, interleave = input_161_interleave_0, values = (x_107, var_3065))[name = string("input_161")];
+            tensor<int32, [1]> normed_161_axes_0 = const()[name = string("normed_161_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3060_to_fp16 = const()[name = string("op_3060_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_161_cast_fp16 = layer_norm(axes = normed_161_axes_0, epsilon = var_3060_to_fp16, x = input_161)[name = string("normed_161_cast_fp16")];
+            tensor<int32, [2]> var_3070_split_sizes_0 = const()[name = string("op_3070_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3070_axis_0 = const()[name = string("op_3070_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3070_0, tensor<fp16, [1, 3, 2560]> var_3070_1 = split(axis = var_3070_axis_0, split_sizes = var_3070_split_sizes_0, x = normed_161_cast_fp16)[name = string("op_3070")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_63 = mul(x = var_3070_0, y = layers_6_post_feedforward_layernorm_weight)[name = string("hidden_states_63")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_65_cast_fp16 = add(x = x_105_cast_fp16, y = hidden_states_63)[name = string("hidden_states_65_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_13_begin_0 = const()[name = string("per_layer_slice_13_begin_0"), val = tensor<int32, [3]>([0, 0, 9984])];
+            tensor<int32, [3]> per_layer_slice_13_end_0 = const()[name = string("per_layer_slice_13_end_0"), val = tensor<int32, [3]>([1, 3, 10240])];
+            tensor<bool, [3]> per_layer_slice_13_end_mask_0 = const()[name = string("per_layer_slice_13_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_13_cast_fp16 = slice_by_index(begin = per_layer_slice_13_begin_0, end = per_layer_slice_13_end_0, end_mask = per_layer_slice_13_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_13_cast_fp16")];
+            tensor<int32, [3]> var_3098 = const()[name = string("op_3098"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_163_axes_0 = const()[name = string("input_163_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3099 = transpose(perm = var_3098, x = hidden_states_65_cast_fp16)[name = string("transpose_26")];
+            tensor<fp16, [1, 2560, 1, 3]> input_163 = expand_dims(axes = input_163_axes_0, x = var_3099)[name = string("input_163")];
+            string gated_37_pad_type_0 = const()[name = string("gated_37_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_37_strides_0 = const()[name = string("gated_37_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_37_pad_0 = const()[name = string("gated_37_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_37_dilations_0 = const()[name = string("gated_37_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_37_groups_0 = const()[name = string("gated_37_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_37 = conv(dilations = gated_37_dilations_0, groups = gated_37_groups_0, pad = gated_37_pad_0, pad_type = gated_37_pad_type_0, strides = gated_37_strides_0, weight = layers_6_per_layer_input_gate_weight_palettized, x = input_163)[name = string("gated_37")];
+            string gated_39_mode_0 = const()[name = string("gated_39_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_39 = gelu(mode = gated_39_mode_0, x = gated_37)[name = string("gated_39")];
+            tensor<int32, [3]> var_3118 = const()[name = string("op_3118"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_13_axes_0 = const()[name = string("per_layer_slice_conv_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3119_cast_fp16 = transpose(perm = var_3118, x = per_layer_slice_13_cast_fp16)[name = string("transpose_25")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_13_cast_fp16 = expand_dims(axes = per_layer_slice_conv_13_axes_0, x = var_3119_cast_fp16)[name = string("per_layer_slice_conv_13_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_165_cast_fp16 = mul(x = gated_39, y = per_layer_slice_conv_13_cast_fp16)[name = string("input_165_cast_fp16")];
+            string gated_41_pad_type_0 = const()[name = string("gated_41_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_41_strides_0 = const()[name = string("gated_41_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_41_pad_0 = const()[name = string("gated_41_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_41_dilations_0 = const()[name = string("gated_41_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_41_groups_0 = const()[name = string("gated_41_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_6_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409078208))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409405952))))[name = string("layers_6_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_41_cast_fp16 = conv(dilations = gated_41_dilations_0, groups = gated_41_groups_0, pad = gated_41_pad_0, pad_type = gated_41_pad_type_0, strides = gated_41_strides_0, weight = layers_6_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_165_cast_fp16)[name = string("gated_41_cast_fp16")];
+            tensor<int32, [1]> var_3135_axes_0 = const()[name = string("op_3135_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3135_cast_fp16 = squeeze(axes = var_3135_axes_0, x = gated_41_cast_fp16)[name = string("op_3135_cast_fp16")];
+            tensor<int32, [3]> var_3139 = const()[name = string("op_3139"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3145 = const()[name = string("op_3145"), val = int32(-1)];
+            fp16 const_54_promoted_to_fp16 = const()[name = string("const_54_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_109_cast_fp16 = transpose(perm = var_3139, x = var_3135_cast_fp16)[name = string("transpose_24")];
+            tensor<fp16, [1, 3, 2560]> var_3147_cast_fp16 = mul(x = x_109_cast_fp16, y = const_54_promoted_to_fp16)[name = string("op_3147_cast_fp16")];
+            bool input_167_interleave_0 = const()[name = string("input_167_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_167_cast_fp16 = concat(axis = var_3145, interleave = input_167_interleave_0, values = (x_109_cast_fp16, var_3147_cast_fp16))[name = string("input_167_cast_fp16")];
+            tensor<int32, [1]> normed_165_axes_0 = const()[name = string("normed_165_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3142_to_fp16 = const()[name = string("op_3142_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_165_cast_fp16 = layer_norm(axes = normed_165_axes_0, epsilon = var_3142_to_fp16, x = input_167_cast_fp16)[name = string("normed_165_cast_fp16")];
+            tensor<int32, [2]> var_3152_split_sizes_0 = const()[name = string("op_3152_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3152_axis_0 = const()[name = string("op_3152_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3152_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3152_cast_fp16_1 = split(axis = var_3152_axis_0, split_sizes = var_3152_split_sizes_0, x = normed_165_cast_fp16)[name = string("op_3152_cast_fp16")];
+            tensor<fp16, [2560]> layers_6_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_6_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409408576)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_69_cast_fp16 = mul(x = var_3152_cast_fp16_0, y = layers_6_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_69_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_71_cast_fp16 = add(x = hidden_states_65_cast_fp16, y = hidden_states_69_cast_fp16)[name = string("hidden_states_71_cast_fp16")];
+            tensor<fp16, [1]> const_55_promoted_to_fp16 = const()[name = string("const_55_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.b6p-1])];
+            tensor<fp16, [1, 3, 2560]> x_111_cast_fp16 = mul(x = hidden_states_71_cast_fp16, y = const_55_promoted_to_fp16)[name = string("x_111_cast_fp16")];
+            int32 var_3167 = const()[name = string("op_3167"), val = int32(-1)];
+            fp16 const_56_promoted_to_fp16 = const()[name = string("const_56_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3169_cast_fp16 = mul(x = x_111_cast_fp16, y = const_56_promoted_to_fp16)[name = string("op_3169_cast_fp16")];
+            bool input_169_interleave_0 = const()[name = string("input_169_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_169_cast_fp16 = concat(axis = var_3167, interleave = input_169_interleave_0, values = (x_111_cast_fp16, var_3169_cast_fp16))[name = string("input_169_cast_fp16")];
+            tensor<int32, [1]> normed_169_axes_0 = const()[name = string("normed_169_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3164_to_fp16 = const()[name = string("op_3164_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_169_cast_fp16 = layer_norm(axes = normed_169_axes_0, epsilon = var_3164_to_fp16, x = input_169_cast_fp16)[name = string("normed_169_cast_fp16")];
+            tensor<int32, [2]> var_3174_split_sizes_0 = const()[name = string("op_3174_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3174_axis_0 = const()[name = string("op_3174_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3174_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3174_cast_fp16_1 = split(axis = var_3174_axis_0, split_sizes = var_3174_split_sizes_0, x = normed_169_cast_fp16)[name = string("op_3174_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409413760)))];
+            tensor<fp16, [1, 3, 2560]> h_43_cast_fp16 = mul(x = var_3174_cast_fp16_0, y = layers_7_input_layernorm_weight_promoted_to_fp16)[name = string("h_43_cast_fp16")];
+            tensor<int32, [3]> var_3180 = const()[name = string("op_3180"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3183_axes_0 = const()[name = string("op_3183_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3181_cast_fp16 = transpose(perm = var_3180, x = h_43_cast_fp16)[name = string("transpose_23")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3183_cast_fp16 = expand_dims(axes = var_3183_axes_0, x = var_3181_cast_fp16)[name = string("op_3183_cast_fp16")];
+            string q_71_pad_type_0 = const()[name = string("q_71_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_71_strides_0 = const()[name = string("q_71_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_71_pad_0 = const()[name = string("q_71_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_71_dilations_0 = const()[name = string("q_71_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_71_groups_0 = const()[name = string("q_71_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2048, 1, 3]> q_71 = conv(dilations = q_71_dilations_0, groups = q_71_groups_0, pad = q_71_pad_0, pad_type = q_71_pad_type_0, strides = q_71_strides_0, weight = layers_7_self_attn_q_proj_weight_palettized, x = var_3183_cast_fp16)[name = string("q_71")];
+            tensor<int32, [4]> var_3204 = const()[name = string("op_3204"), val = tensor<int32, [4]>([1, 8, 256, 3])];
+            tensor<fp16, [1, 8, 256, 3]> var_3205 = reshape(shape = var_3204, x = q_71)[name = string("op_3205")];
+            tensor<int32, [4]> transpose_50_perm_0 = const()[name = string("transpose_50_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3228 = const()[name = string("op_3228"), val = tensor<int32, [3]>([3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> transpose_50 = transpose(perm = transpose_50_perm_0, x = var_3205)[name = string("transpose_22")];
+            tensor<fp16, [3, 8, 256]> x_113 = reshape(shape = var_3228, x = transpose_50)[name = string("x_113")];
+            int32 var_3234 = const()[name = string("op_3234"), val = int32(-1)];
+            fp16 const_57_promoted = const()[name = string("const_57_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 256]> var_3236 = mul(x = x_113, y = const_57_promoted)[name = string("op_3236")];
+            bool input_173_interleave_0 = const()[name = string("input_173_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 512]> input_173 = concat(axis = var_3234, interleave = input_173_interleave_0, values = (x_113, var_3236))[name = string("input_173")];
+            tensor<int32, [1]> normed_173_axes_0 = const()[name = string("normed_173_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3231_to_fp16 = const()[name = string("op_3231_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 512]> normed_173_cast_fp16 = layer_norm(axes = normed_173_axes_0, epsilon = var_3231_to_fp16, x = input_173)[name = string("normed_173_cast_fp16")];
+            tensor<int32, [2]> var_3241_split_sizes_0 = const()[name = string("op_3241_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3241_axis_0 = const()[name = string("op_3241_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 256]> var_3241_0, tensor<fp16, [3, 8, 256]> var_3241_1 = split(axis = var_3241_axis_0, split_sizes = var_3241_split_sizes_0, x = normed_173_cast_fp16)[name = string("op_3241")];
+            tensor<fp16, [3, 8, 256]> q_75 = mul(x = var_3241_0, y = layers_0_self_attn_q_norm_weight)[name = string("q_75")];
+            tensor<int32, [4]> var_3248 = const()[name = string("op_3248"), val = tensor<int32, [4]>([1, 3, 8, 256])];
+            tensor<fp16, [1, 3, 8, 256]> var_3249 = reshape(shape = var_3248, x = q_75)[name = string("op_3249")];
+            tensor<int32, [4]> var_3254 = const()[name = string("op_3254"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 256]> q_77 = transpose(perm = var_3254, x = var_3249)[name = string("transpose_21")];
+            tensor<fp16, [1, 8, 3, 256]> var_3256_cast_fp16 = mul(x = q_77, y = cos_s)[name = string("op_3256_cast_fp16")];
+            tensor<int32, [2]> var_3257_split_sizes_0 = const()[name = string("op_3257_split_sizes_0"), val = tensor<int32, [2]>([128, 128])];
+            int32 var_3257_axis_0 = const()[name = string("op_3257_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 128]> var_3257_0, tensor<fp16, [1, 8, 3, 128]> var_3257_1 = split(axis = var_3257_axis_0, split_sizes = var_3257_split_sizes_0, x = q_77)[name = string("op_3257")];
+            fp16 const_58_promoted = const()[name = string("const_58_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 128]> var_3259 = mul(x = var_3257_1, y = const_58_promoted)[name = string("op_3259")];
+            int32 var_3261 = const()[name = string("op_3261"), val = int32(-1)];
+            bool var_3262_interleave_0 = const()[name = string("op_3262_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> var_3262 = concat(axis = var_3261, interleave = var_3262_interleave_0, values = (var_3259, var_3257_0))[name = string("op_3262")];
+            tensor<fp16, [1, 8, 3, 256]> var_3263_cast_fp16 = mul(x = var_3262, y = sin_s)[name = string("op_3263_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 256]> q_79_cast_fp16 = add(x = var_3256_cast_fp16, y = var_3263_cast_fp16)[name = string("q_79_cast_fp16")];
+            bool attn_weights_29_transpose_x_0 = const()[name = string("attn_weights_29_transpose_x_0"), val = bool(false)];
+            bool attn_weights_29_transpose_y_0 = const()[name = string("attn_weights_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_weights_29_cast_fp16 = matmul(transpose_x = attn_weights_29_transpose_x_0, transpose_y = attn_weights_29_transpose_y_0, x = q_79_cast_fp16, y = transpose_37_cast_fp16)[name = string("attn_weights_29_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> x_115_cast_fp16 = add(x = attn_weights_29_cast_fp16, y = causal_mask_sliding)[name = string("x_115_cast_fp16")];
+            tensor<int32, [1]> reduce_max_7_axes_0 = const()[name = string("reduce_max_7_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_7_keep_dims_0 = const()[name = string("reduce_max_7_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_7 = reduce_max(axes = reduce_max_7_axes_0, keep_dims = reduce_max_7_keep_dims_0, x = x_115_cast_fp16)[name = string("reduce_max_7")];
+            tensor<fp16, [1, 8, 3, 512]> var_3295 = sub(x = x_115_cast_fp16, y = reduce_max_7)[name = string("op_3295")];
+            tensor<fp16, [1, 8, 3, 512]> var_3301 = exp(x = var_3295)[name = string("op_3301")];
+            tensor<int32, [1]> var_3311_axes_0 = const()[name = string("op_3311_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3311_keep_dims_0 = const()[name = string("op_3311_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3311 = reduce_sum(axes = var_3311_axes_0, keep_dims = var_3311_keep_dims_0, x = var_3301)[name = string("op_3311")];
+            tensor<fp16, [1, 8, 3, 512]> var_3317_cast_fp16 = real_div(x = var_3301, y = var_3311)[name = string("op_3317_cast_fp16")];
+            bool attn_output_43_transpose_x_0 = const()[name = string("attn_output_43_transpose_x_0"), val = bool(false)];
+            bool attn_output_43_transpose_y_0 = const()[name = string("attn_output_43_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 256]> attn_output_43_cast_fp16 = matmul(transpose_x = attn_output_43_transpose_x_0, transpose_y = attn_output_43_transpose_y_0, x = var_3317_cast_fp16, y = V_expanded_1_cast_fp16)[name = string("attn_output_43_cast_fp16")];
+            tensor<int32, [4]> var_3328 = const()[name = string("op_3328"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3335 = const()[name = string("op_3335"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 256]> var_3329_cast_fp16 = transpose(perm = var_3328, x = attn_output_43_cast_fp16)[name = string("transpose_20")];
+            tensor<fp16, [1, 3, 2048]> attn_output_45_cast_fp16 = reshape(shape = var_3335, x = var_3329_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [3]> var_3340 = const()[name = string("op_3340"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3356_pad_type_0 = const()[name = string("op_3356_pad_type_0"), val = string("valid")];
+            int32 var_3356_groups_0 = const()[name = string("op_3356_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3356_strides_0 = const()[name = string("op_3356_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3356_pad_0 = const()[name = string("op_3356_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3356_dilations_0 = const()[name = string("op_3356_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 2048, 1]> squeeze_7_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 2048, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(409418944))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412040448))))[name = string("squeeze_7_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 2048, 3]> var_3341_cast_fp16 = transpose(perm = var_3340, x = attn_output_45_cast_fp16)[name = string("transpose_19")];
+            tensor<fp16, [1, 2560, 3]> var_3356_cast_fp16 = conv(dilations = var_3356_dilations_0, groups = var_3356_groups_0, pad = var_3356_pad_0, pad_type = var_3356_pad_type_0, strides = var_3356_strides_0, weight = squeeze_7_cast_fp16_to_fp32_to_fp16_palettized, x = var_3341_cast_fp16)[name = string("op_3356_cast_fp16")];
+            tensor<int32, [3]> var_3360 = const()[name = string("op_3360"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3366 = const()[name = string("op_3366"), val = int32(-1)];
+            fp16 const_59_promoted_to_fp16 = const()[name = string("const_59_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_119_cast_fp16 = transpose(perm = var_3360, x = var_3356_cast_fp16)[name = string("transpose_18")];
+            tensor<fp16, [1, 3, 2560]> var_3368_cast_fp16 = mul(x = x_119_cast_fp16, y = const_59_promoted_to_fp16)[name = string("op_3368_cast_fp16")];
+            bool input_177_interleave_0 = const()[name = string("input_177_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_177_cast_fp16 = concat(axis = var_3366, interleave = input_177_interleave_0, values = (x_119_cast_fp16, var_3368_cast_fp16))[name = string("input_177_cast_fp16")];
+            tensor<int32, [1]> normed_177_axes_0 = const()[name = string("normed_177_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3363_to_fp16 = const()[name = string("op_3363_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_177_cast_fp16 = layer_norm(axes = normed_177_axes_0, epsilon = var_3363_to_fp16, x = input_177_cast_fp16)[name = string("normed_177_cast_fp16")];
+            tensor<int32, [2]> var_3373_split_sizes_0 = const()[name = string("op_3373_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3373_axis_0 = const()[name = string("op_3373_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3373_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3373_cast_fp16_1 = split(axis = var_3373_axis_0, split_sizes = var_3373_split_sizes_0, x = normed_177_cast_fp16)[name = string("op_3373_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412043072)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_47_cast_fp16 = mul(x = var_3373_cast_fp16_0, y = layers_7_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_47_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_121_cast_fp16 = add(x = x_111_cast_fp16, y = attn_output_47_cast_fp16)[name = string("x_121_cast_fp16")];
+            int32 var_3382 = const()[name = string("op_3382"), val = int32(-1)];
+            fp16 const_60_promoted_to_fp16 = const()[name = string("const_60_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3384_cast_fp16 = mul(x = x_121_cast_fp16, y = const_60_promoted_to_fp16)[name = string("op_3384_cast_fp16")];
+            bool input_179_interleave_0 = const()[name = string("input_179_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_179_cast_fp16 = concat(axis = var_3382, interleave = input_179_interleave_0, values = (x_121_cast_fp16, var_3384_cast_fp16))[name = string("input_179_cast_fp16")];
+            tensor<int32, [1]> normed_181_axes_0 = const()[name = string("normed_181_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3379_to_fp16 = const()[name = string("op_3379_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_181_cast_fp16 = layer_norm(axes = normed_181_axes_0, epsilon = var_3379_to_fp16, x = input_179_cast_fp16)[name = string("normed_181_cast_fp16")];
+            tensor<int32, [2]> var_3389_split_sizes_0 = const()[name = string("op_3389_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3389_axis_0 = const()[name = string("op_3389_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3389_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3389_cast_fp16_1 = split(axis = var_3389_axis_0, split_sizes = var_3389_split_sizes_0, x = normed_181_cast_fp16)[name = string("op_3389_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412048256)))];
+            tensor<fp16, [1, 3, 2560]> h_45_cast_fp16 = mul(x = var_3389_cast_fp16_0, y = layers_7_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_45_cast_fp16")];
+            tensor<int32, [3]> var_3400 = const()[name = string("op_3400"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_181_axes_0 = const()[name = string("input_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3401 = transpose(perm = var_3400, x = h_45_cast_fp16)[name = string("transpose_17")];
+            tensor<fp16, [1, 2560, 1, 3]> input_181 = expand_dims(axes = input_181_axes_0, x = var_3401)[name = string("input_181")];
+            string gate_29_pad_type_0 = const()[name = string("gate_29_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_29_strides_0 = const()[name = string("gate_29_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_29_pad_0 = const()[name = string("gate_29_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_29_dilations_0 = const()[name = string("gate_29_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_29_groups_0 = const()[name = string("gate_29_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_29 = conv(dilations = gate_29_dilations_0, groups = gate_29_groups_0, pad = gate_29_pad_0, pad_type = gate_29_pad_type_0, strides = gate_29_strides_0, weight = layers_7_mlp_gate_proj_weight_palettized, x = input_181)[name = string("gate_29")];
+            string up_15_pad_type_0 = const()[name = string("up_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_15_strides_0 = const()[name = string("up_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_15_pad_0 = const()[name = string("up_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_15_dilations_0 = const()[name = string("up_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_15_groups_0 = const()[name = string("up_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up_15 = conv(dilations = up_15_dilations_0, groups = up_15_groups_0, pad = up_15_pad_0, pad_type = up_15_pad_type_0, strides = up_15_strides_0, weight = layers_7_mlp_up_proj_weight_palettized, x = input_181)[name = string("up_15")];
+            string gate_31_mode_0 = const()[name = string("gate_31_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate_31 = gelu(mode = gate_31_mode_0, x = gate_29)[name = string("gate_31")];
+            tensor<fp16, [1, 10240, 1, 3]> input_183 = mul(x = gate_31, y = up_15)[name = string("input_183")];
+            string mlp_out_15_pad_type_0 = const()[name = string("mlp_out_15_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_15_strides_0 = const()[name = string("mlp_out_15_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_15_pad_0 = const()[name = string("mlp_out_15_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_15_dilations_0 = const()[name = string("mlp_out_15_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_15_groups_0 = const()[name = string("mlp_out_15_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out_15 = conv(dilations = mlp_out_15_dilations_0, groups = mlp_out_15_groups_0, pad = mlp_out_15_pad_0, pad_type = mlp_out_15_pad_type_0, strides = mlp_out_15_strides_0, weight = layers_7_mlp_down_proj_weight_palettized, x = input_183)[name = string("mlp_out_15")];
+            tensor<int32, [1]> var_3441_axes_0 = const()[name = string("op_3441_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3441 = squeeze(axes = var_3441_axes_0, x = mlp_out_15)[name = string("op_3441")];
+            tensor<int32, [3]> var_3445 = const()[name = string("op_3445"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3451 = const()[name = string("op_3451"), val = int32(-1)];
+            fp16 const_61_promoted = const()[name = string("const_61_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_123 = transpose(perm = var_3445, x = var_3441)[name = string("transpose_16")];
+            tensor<fp16, [1, 3, 2560]> var_3453 = mul(x = x_123, y = const_61_promoted)[name = string("op_3453")];
+            bool input_185_interleave_0 = const()[name = string("input_185_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_185 = concat(axis = var_3451, interleave = input_185_interleave_0, values = (x_123, var_3453))[name = string("input_185")];
+            tensor<int32, [1]> normed_185_axes_0 = const()[name = string("normed_185_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3448_to_fp16 = const()[name = string("op_3448_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_185_cast_fp16 = layer_norm(axes = normed_185_axes_0, epsilon = var_3448_to_fp16, x = input_185)[name = string("normed_185_cast_fp16")];
+            tensor<int32, [2]> var_3458_split_sizes_0 = const()[name = string("op_3458_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3458_axis_0 = const()[name = string("op_3458_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3458_0, tensor<fp16, [1, 3, 2560]> var_3458_1 = split(axis = var_3458_axis_0, split_sizes = var_3458_split_sizes_0, x = normed_185_cast_fp16)[name = string("op_3458")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_73 = mul(x = var_3458_0, y = layers_7_post_feedforward_layernorm_weight)[name = string("hidden_states_73")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_75_cast_fp16 = add(x = x_121_cast_fp16, y = hidden_states_73)[name = string("hidden_states_75_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_15_begin_0 = const()[name = string("per_layer_slice_15_begin_0"), val = tensor<int32, [3]>([0, 0, 10240])];
+            tensor<int32, [3]> per_layer_slice_15_end_0 = const()[name = string("per_layer_slice_15_end_0"), val = tensor<int32, [3]>([1, 3, 10496])];
+            tensor<bool, [3]> per_layer_slice_15_end_mask_0 = const()[name = string("per_layer_slice_15_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_15_cast_fp16 = slice_by_index(begin = per_layer_slice_15_begin_0, end = per_layer_slice_15_end_0, end_mask = per_layer_slice_15_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_15_cast_fp16")];
+            tensor<int32, [3]> var_3486 = const()[name = string("op_3486"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_187_axes_0 = const()[name = string("input_187_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3487 = transpose(perm = var_3486, x = hidden_states_75_cast_fp16)[name = string("transpose_15")];
+            tensor<fp16, [1, 2560, 1, 3]> input_187 = expand_dims(axes = input_187_axes_0, x = var_3487)[name = string("input_187")];
+            string gated_43_pad_type_0 = const()[name = string("gated_43_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_43_strides_0 = const()[name = string("gated_43_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_43_pad_0 = const()[name = string("gated_43_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_43_dilations_0 = const()[name = string("gated_43_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_43_groups_0 = const()[name = string("gated_43_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_43 = conv(dilations = gated_43_dilations_0, groups = gated_43_groups_0, pad = gated_43_pad_0, pad_type = gated_43_pad_type_0, strides = gated_43_strides_0, weight = layers_7_per_layer_input_gate_weight_palettized, x = input_187)[name = string("gated_43")];
+            string gated_45_mode_0 = const()[name = string("gated_45_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_45 = gelu(mode = gated_45_mode_0, x = gated_43)[name = string("gated_45")];
+            tensor<int32, [3]> var_3506 = const()[name = string("op_3506"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_15_axes_0 = const()[name = string("per_layer_slice_conv_15_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3507_cast_fp16 = transpose(perm = var_3506, x = per_layer_slice_15_cast_fp16)[name = string("transpose_14")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_15_cast_fp16 = expand_dims(axes = per_layer_slice_conv_15_axes_0, x = var_3507_cast_fp16)[name = string("per_layer_slice_conv_15_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_189_cast_fp16 = mul(x = gated_45, y = per_layer_slice_conv_15_cast_fp16)[name = string("input_189_cast_fp16")];
+            string gated_47_pad_type_0 = const()[name = string("gated_47_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_47_strides_0 = const()[name = string("gated_47_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_47_pad_0 = const()[name = string("gated_47_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_47_dilations_0 = const()[name = string("gated_47_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_47_groups_0 = const()[name = string("gated_47_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_7_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412053440))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412381184))))[name = string("layers_7_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_47_cast_fp16 = conv(dilations = gated_47_dilations_0, groups = gated_47_groups_0, pad = gated_47_pad_0, pad_type = gated_47_pad_type_0, strides = gated_47_strides_0, weight = layers_7_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_189_cast_fp16)[name = string("gated_47_cast_fp16")];
+            tensor<int32, [1]> var_3523_axes_0 = const()[name = string("op_3523_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3523_cast_fp16 = squeeze(axes = var_3523_axes_0, x = gated_47_cast_fp16)[name = string("op_3523_cast_fp16")];
+            tensor<int32, [3]> var_3527 = const()[name = string("op_3527"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3533 = const()[name = string("op_3533"), val = int32(-1)];
+            fp16 const_62_promoted_to_fp16 = const()[name = string("const_62_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_125_cast_fp16 = transpose(perm = var_3527, x = var_3523_cast_fp16)[name = string("transpose_13")];
+            tensor<fp16, [1, 3, 2560]> var_3535_cast_fp16 = mul(x = x_125_cast_fp16, y = const_62_promoted_to_fp16)[name = string("op_3535_cast_fp16")];
+            bool input_191_interleave_0 = const()[name = string("input_191_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_191_cast_fp16 = concat(axis = var_3533, interleave = input_191_interleave_0, values = (x_125_cast_fp16, var_3535_cast_fp16))[name = string("input_191_cast_fp16")];
+            tensor<int32, [1]> normed_189_axes_0 = const()[name = string("normed_189_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3530_to_fp16 = const()[name = string("op_3530_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_189_cast_fp16 = layer_norm(axes = normed_189_axes_0, epsilon = var_3530_to_fp16, x = input_191_cast_fp16)[name = string("normed_189_cast_fp16")];
+            tensor<int32, [2]> var_3540_split_sizes_0 = const()[name = string("op_3540_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3540_axis_0 = const()[name = string("op_3540_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3540_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3540_cast_fp16_1 = split(axis = var_3540_axis_0, split_sizes = var_3540_split_sizes_0, x = normed_189_cast_fp16)[name = string("op_3540_cast_fp16")];
+            tensor<fp16, [2560]> layers_7_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_7_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412383808)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_79_cast_fp16 = mul(x = var_3540_cast_fp16_0, y = layers_7_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_79_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_81_cast_fp16 = add(x = hidden_states_75_cast_fp16, y = hidden_states_79_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            tensor<fp16, [1]> const_63_promoted_to_fp16 = const()[name = string("const_63_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.9ep-1])];
+            tensor<fp16, [1, 3, 2560]> x_127_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = const_63_promoted_to_fp16)[name = string("x_127_cast_fp16")];
+            int32 var_3555 = const()[name = string("op_3555"), val = int32(-1)];
+            fp16 const_64_promoted_to_fp16 = const()[name = string("const_64_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3557_cast_fp16 = mul(x = x_127_cast_fp16, y = const_64_promoted_to_fp16)[name = string("op_3557_cast_fp16")];
+            bool input_193_interleave_0 = const()[name = string("input_193_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_193_cast_fp16 = concat(axis = var_3555, interleave = input_193_interleave_0, values = (x_127_cast_fp16, var_3557_cast_fp16))[name = string("input_193_cast_fp16")];
+            tensor<int32, [1]> normed_193_axes_0 = const()[name = string("normed_193_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3552_to_fp16 = const()[name = string("op_3552_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_193_cast_fp16 = layer_norm(axes = normed_193_axes_0, epsilon = var_3552_to_fp16, x = input_193_cast_fp16)[name = string("normed_193_cast_fp16")];
+            tensor<int32, [2]> var_3562_split_sizes_0 = const()[name = string("op_3562_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3562_axis_0 = const()[name = string("op_3562_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3562_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3562_cast_fp16_1 = split(axis = var_3562_axis_0, split_sizes = var_3562_split_sizes_0, x = normed_193_cast_fp16)[name = string("op_3562_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_input_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_input_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412388992)))];
+            tensor<fp16, [1, 3, 2560]> h_49_cast_fp16 = mul(x = var_3562_cast_fp16_0, y = layers_8_input_layernorm_weight_promoted_to_fp16)[name = string("h_49_cast_fp16")];
+            tensor<int32, [3]> var_3568 = const()[name = string("op_3568"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> var_3571_axes_0 = const()[name = string("op_3571_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3569_cast_fp16 = transpose(perm = var_3568, x = h_49_cast_fp16)[name = string("transpose_12")];
+            tensor<fp16, [1, 2560, 1, 3]> var_3571_cast_fp16 = expand_dims(axes = var_3571_axes_0, x = var_3569_cast_fp16)[name = string("op_3571_cast_fp16")];
+            string q_81_pad_type_0 = const()[name = string("q_81_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> q_81_strides_0 = const()[name = string("q_81_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> q_81_pad_0 = const()[name = string("q_81_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> q_81_dilations_0 = const()[name = string("q_81_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 q_81_groups_0 = const()[name = string("q_81_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 4096, 1, 3]> q_81 = conv(dilations = q_81_dilations_0, groups = q_81_groups_0, pad = q_81_pad_0, pad_type = q_81_pad_type_0, strides = q_81_strides_0, weight = layers_8_self_attn_q_proj_weight_palettized, x = var_3571_cast_fp16)[name = string("q_81")];
+            tensor<int32, [4]> var_3592 = const()[name = string("op_3592"), val = tensor<int32, [4]>([1, 8, 512, 3])];
+            tensor<fp16, [1, 8, 512, 3]> var_3593 = reshape(shape = var_3592, x = q_81)[name = string("op_3593")];
+            tensor<int32, [4]> transpose_52_perm_0 = const()[name = string("transpose_52_perm_0"), val = tensor<int32, [4]>([0, 3, 1, 2])];
+            tensor<int32, [3]> var_3616 = const()[name = string("op_3616"), val = tensor<int32, [3]>([3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> transpose_52 = transpose(perm = transpose_52_perm_0, x = var_3593)[name = string("transpose_11")];
+            tensor<fp16, [3, 8, 512]> x_129 = reshape(shape = var_3616, x = transpose_52)[name = string("x_129")];
+            int32 var_3622 = const()[name = string("op_3622"), val = int32(-1)];
+            fp16 const_65_promoted = const()[name = string("const_65_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [3, 8, 512]> var_3624 = mul(x = x_129, y = const_65_promoted)[name = string("op_3624")];
+            bool input_197_interleave_0 = const()[name = string("input_197_interleave_0"), val = bool(false)];
+            tensor<fp16, [3, 8, 1024]> input_197 = concat(axis = var_3622, interleave = input_197_interleave_0, values = (x_129, var_3624))[name = string("input_197")];
+            tensor<int32, [1]> normed_197_axes_0 = const()[name = string("normed_197_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3619_to_fp16 = const()[name = string("op_3619_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [3, 8, 1024]> normed_197_cast_fp16 = layer_norm(axes = normed_197_axes_0, epsilon = var_3619_to_fp16, x = input_197)[name = string("normed_197_cast_fp16")];
+            tensor<int32, [2]> var_3629_split_sizes_0 = const()[name = string("op_3629_split_sizes_0"), val = tensor<int32, [2]>([512, 512])];
+            int32 var_3629_axis_0 = const()[name = string("op_3629_axis_0"), val = int32(-1)];
+            tensor<fp16, [3, 8, 512]> var_3629_0, tensor<fp16, [3, 8, 512]> var_3629_1 = split(axis = var_3629_axis_0, split_sizes = var_3629_split_sizes_0, x = normed_197_cast_fp16)[name = string("op_3629")];
+            tensor<fp16, [3, 8, 512]> q_85 = mul(x = var_3629_0, y = layers_2_self_attn_q_norm_weight)[name = string("q_85")];
+            tensor<int32, [4]> var_3636 = const()[name = string("op_3636"), val = tensor<int32, [4]>([1, 3, 8, 512])];
+            tensor<fp16, [1, 3, 8, 512]> var_3637 = reshape(shape = var_3636, x = q_85)[name = string("op_3637")];
+            tensor<int32, [4]> var_3642 = const()[name = string("op_3642"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<fp16, [1, 8, 3, 512]> q_87 = transpose(perm = var_3642, x = var_3637)[name = string("transpose_10")];
+            tensor<fp16, [1, 8, 3, 512]> var_3644_cast_fp16 = mul(x = q_87, y = cos_f)[name = string("op_3644_cast_fp16")];
+            tensor<int32, [2]> var_3645_split_sizes_0 = const()[name = string("op_3645_split_sizes_0"), val = tensor<int32, [2]>([256, 256])];
+            int32 var_3645_axis_0 = const()[name = string("op_3645_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 8, 3, 256]> var_3645_0, tensor<fp16, [1, 8, 3, 256]> var_3645_1 = split(axis = var_3645_axis_0, split_sizes = var_3645_split_sizes_0, x = q_87)[name = string("op_3645")];
+            fp16 const_66_promoted = const()[name = string("const_66_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 8, 3, 256]> var_3647 = mul(x = var_3645_1, y = const_66_promoted)[name = string("op_3647")];
+            int32 var_3649 = const()[name = string("op_3649"), val = int32(-1)];
+            bool var_3650_interleave_0 = const()[name = string("op_3650_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> var_3650 = concat(axis = var_3649, interleave = var_3650_interleave_0, values = (var_3647, var_3645_0))[name = string("op_3650")];
+            tensor<fp16, [1, 8, 3, 512]> var_3651_cast_fp16 = mul(x = var_3650, y = sin_f)[name = string("op_3651_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 512]> q_cast_fp16 = add(x = var_3644_cast_fp16, y = var_3651_cast_fp16)[name = string("q_cast_fp16")];
+            bool attn_weights_33_transpose_x_0 = const()[name = string("attn_weights_33_transpose_x_0"), val = bool(false)];
+            bool attn_weights_33_transpose_y_0 = const()[name = string("attn_weights_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 2048]> attn_weights_33_cast_fp16 = matmul(transpose_x = attn_weights_33_transpose_x_0, transpose_y = attn_weights_33_transpose_y_0, x = q_cast_fp16, y = transpose_41_cast_fp16)[name = string("attn_weights_33_cast_fp16")];
+            tensor<fp16, [1, 8, 3, 2048]> x_131_cast_fp16 = add(x = attn_weights_33_cast_fp16, y = causal_mask_full)[name = string("x_131_cast_fp16")];
+            tensor<int32, [1]> reduce_max_8_axes_0 = const()[name = string("reduce_max_8_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_8_keep_dims_0 = const()[name = string("reduce_max_8_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> reduce_max_8 = reduce_max(axes = reduce_max_8_axes_0, keep_dims = reduce_max_8_keep_dims_0, x = x_131_cast_fp16)[name = string("reduce_max_8")];
+            tensor<fp16, [1, 8, 3, 2048]> var_3683 = sub(x = x_131_cast_fp16, y = reduce_max_8)[name = string("op_3683")];
+            tensor<fp16, [1, 8, 3, 2048]> var_3689 = exp(x = var_3683)[name = string("op_3689")];
+            tensor<int32, [1]> var_3699_axes_0 = const()[name = string("op_3699_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3699_keep_dims_0 = const()[name = string("op_3699_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 8, 3, 1]> var_3699 = reduce_sum(axes = var_3699_axes_0, keep_dims = var_3699_keep_dims_0, x = var_3689)[name = string("op_3699")];
+            tensor<fp16, [1, 8, 3, 2048]> var_3705_cast_fp16 = real_div(x = var_3689, y = var_3699)[name = string("op_3705_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 8, 3, 512]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = var_3705_cast_fp16, y = V_expanded_5_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_3716 = const()[name = string("op_3716"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3723 = const()[name = string("op_3723"), val = tensor<int32, [3]>([1, 3, -1])];
+            tensor<fp16, [1, 3, 8, 512]> var_3717_cast_fp16 = transpose(perm = var_3716, x = attn_output_49_cast_fp16)[name = string("transpose_9")];
+            tensor<fp16, [1, 3, 4096]> attn_output_51_cast_fp16 = reshape(shape = var_3723, x = var_3717_cast_fp16)[name = string("attn_output_51_cast_fp16")];
+            tensor<int32, [3]> var_3728 = const()[name = string("op_3728"), val = tensor<int32, [3]>([0, 2, 1])];
+            string var_3744_pad_type_0 = const()[name = string("op_3744_pad_type_0"), val = string("valid")];
+            int32 var_3744_groups_0 = const()[name = string("op_3744_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3744_strides_0 = const()[name = string("op_3744_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3744_pad_0 = const()[name = string("op_3744_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3744_dilations_0 = const()[name = string("op_3744_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [2560, 4096, 1]> squeeze_8_cast_fp16_to_fp32_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 4096, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(412394176))), lut = tensor<fp16, [80, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417637120))))[name = string("squeeze_8_cast_fp16_to_fp32_to_fp16_palettized")];
+            tensor<fp16, [1, 4096, 3]> var_3729_cast_fp16 = transpose(perm = var_3728, x = attn_output_51_cast_fp16)[name = string("transpose_8")];
+            tensor<fp16, [1, 2560, 3]> var_3744_cast_fp16 = conv(dilations = var_3744_dilations_0, groups = var_3744_groups_0, pad = var_3744_pad_0, pad_type = var_3744_pad_type_0, strides = var_3744_strides_0, weight = squeeze_8_cast_fp16_to_fp32_to_fp16_palettized, x = var_3729_cast_fp16)[name = string("op_3744_cast_fp16")];
+            tensor<int32, [3]> var_3748 = const()[name = string("op_3748"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3754 = const()[name = string("op_3754"), val = int32(-1)];
+            fp16 const_67_promoted_to_fp16 = const()[name = string("const_67_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_135_cast_fp16 = transpose(perm = var_3748, x = var_3744_cast_fp16)[name = string("transpose_7")];
+            tensor<fp16, [1, 3, 2560]> var_3756_cast_fp16 = mul(x = x_135_cast_fp16, y = const_67_promoted_to_fp16)[name = string("op_3756_cast_fp16")];
+            bool input_201_interleave_0 = const()[name = string("input_201_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_201_cast_fp16 = concat(axis = var_3754, interleave = input_201_interleave_0, values = (x_135_cast_fp16, var_3756_cast_fp16))[name = string("input_201_cast_fp16")];
+            tensor<int32, [1]> normed_201_axes_0 = const()[name = string("normed_201_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3751_to_fp16 = const()[name = string("op_3751_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_201_cast_fp16 = layer_norm(axes = normed_201_axes_0, epsilon = var_3751_to_fp16, x = input_201_cast_fp16)[name = string("normed_201_cast_fp16")];
+            tensor<int32, [2]> var_3761_split_sizes_0 = const()[name = string("op_3761_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3761_axis_0 = const()[name = string("op_3761_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3761_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3761_cast_fp16_1 = split(axis = var_3761_axis_0, split_sizes = var_3761_split_sizes_0, x = normed_201_cast_fp16)[name = string("op_3761_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_attention_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_attention_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417639744)))];
+            tensor<fp16, [1, 3, 2560]> attn_output_cast_fp16 = mul(x = var_3761_cast_fp16_0, y = layers_8_post_attention_layernorm_weight_promoted_to_fp16)[name = string("attn_output_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> x_137_cast_fp16 = add(x = x_127_cast_fp16, y = attn_output_cast_fp16)[name = string("x_137_cast_fp16")];
+            int32 var_3770 = const()[name = string("op_3770"), val = int32(-1)];
+            fp16 const_68_promoted_to_fp16 = const()[name = string("const_68_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3772_cast_fp16 = mul(x = x_137_cast_fp16, y = const_68_promoted_to_fp16)[name = string("op_3772_cast_fp16")];
+            bool input_203_interleave_0 = const()[name = string("input_203_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_203_cast_fp16 = concat(axis = var_3770, interleave = input_203_interleave_0, values = (x_137_cast_fp16, var_3772_cast_fp16))[name = string("input_203_cast_fp16")];
+            tensor<int32, [1]> normed_205_axes_0 = const()[name = string("normed_205_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3767_to_fp16 = const()[name = string("op_3767_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_205_cast_fp16 = layer_norm(axes = normed_205_axes_0, epsilon = var_3767_to_fp16, x = input_203_cast_fp16)[name = string("normed_205_cast_fp16")];
+            tensor<int32, [2]> var_3777_split_sizes_0 = const()[name = string("op_3777_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3777_axis_0 = const()[name = string("op_3777_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3777_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3777_cast_fp16_1 = split(axis = var_3777_axis_0, split_sizes = var_3777_split_sizes_0, x = normed_205_cast_fp16)[name = string("op_3777_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16 = const()[name = string("layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417644928)))];
+            tensor<fp16, [1, 3, 2560]> h_51_cast_fp16 = mul(x = var_3777_cast_fp16_0, y = layers_8_pre_feedforward_layernorm_weight_promoted_to_fp16)[name = string("h_51_cast_fp16")];
+            tensor<int32, [3]> var_3788 = const()[name = string("op_3788"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_205_axes_0 = const()[name = string("input_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3789 = transpose(perm = var_3788, x = h_51_cast_fp16)[name = string("transpose_6")];
+            tensor<fp16, [1, 2560, 1, 3]> input_205 = expand_dims(axes = input_205_axes_0, x = var_3789)[name = string("input_205")];
+            string gate_33_pad_type_0 = const()[name = string("gate_33_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gate_33_strides_0 = const()[name = string("gate_33_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gate_33_pad_0 = const()[name = string("gate_33_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gate_33_dilations_0 = const()[name = string("gate_33_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gate_33_groups_0 = const()[name = string("gate_33_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> gate_33 = conv(dilations = gate_33_dilations_0, groups = gate_33_groups_0, pad = gate_33_pad_0, pad_type = gate_33_pad_type_0, strides = gate_33_strides_0, weight = layers_8_mlp_gate_proj_weight_palettized, x = input_205)[name = string("gate_33")];
+            string up_pad_type_0 = const()[name = string("up_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> up_strides_0 = const()[name = string("up_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> up_pad_0 = const()[name = string("up_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> up_dilations_0 = const()[name = string("up_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 up_groups_0 = const()[name = string("up_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 10240, 1, 3]> up = conv(dilations = up_dilations_0, groups = up_groups_0, pad = up_pad_0, pad_type = up_pad_type_0, strides = up_strides_0, weight = layers_8_mlp_up_proj_weight_palettized, x = input_205)[name = string("up")];
+            string gate_mode_0 = const()[name = string("gate_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 10240, 1, 3]> gate = gelu(mode = gate_mode_0, x = gate_33)[name = string("gate")];
+            tensor<fp16, [1, 10240, 1, 3]> input_207 = mul(x = gate, y = up)[name = string("input_207")];
+            string mlp_out_pad_type_0 = const()[name = string("mlp_out_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> mlp_out_strides_0 = const()[name = string("mlp_out_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> mlp_out_pad_0 = const()[name = string("mlp_out_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> mlp_out_dilations_0 = const()[name = string("mlp_out_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 mlp_out_groups_0 = const()[name = string("mlp_out_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 2560, 1, 3]> mlp_out = conv(dilations = mlp_out_dilations_0, groups = mlp_out_groups_0, pad = mlp_out_pad_0, pad_type = mlp_out_pad_type_0, strides = mlp_out_strides_0, weight = layers_8_mlp_down_proj_weight_palettized, x = input_207)[name = string("mlp_out")];
+            tensor<int32, [1]> var_3829_axes_0 = const()[name = string("op_3829_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3829 = squeeze(axes = var_3829_axes_0, x = mlp_out)[name = string("op_3829")];
+            tensor<int32, [3]> var_3833 = const()[name = string("op_3833"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3839 = const()[name = string("op_3839"), val = int32(-1)];
+            fp16 const_69_promoted = const()[name = string("const_69_promoted"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_139 = transpose(perm = var_3833, x = var_3829)[name = string("transpose_5")];
+            tensor<fp16, [1, 3, 2560]> var_3841 = mul(x = x_139, y = const_69_promoted)[name = string("op_3841")];
+            bool input_209_interleave_0 = const()[name = string("input_209_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_209 = concat(axis = var_3839, interleave = input_209_interleave_0, values = (x_139, var_3841))[name = string("input_209")];
+            tensor<int32, [1]> normed_209_axes_0 = const()[name = string("normed_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3836_to_fp16 = const()[name = string("op_3836_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_209_cast_fp16 = layer_norm(axes = normed_209_axes_0, epsilon = var_3836_to_fp16, x = input_209)[name = string("normed_209_cast_fp16")];
+            tensor<int32, [2]> var_3846_split_sizes_0 = const()[name = string("op_3846_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3846_axis_0 = const()[name = string("op_3846_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3846_0, tensor<fp16, [1, 3, 2560]> var_3846_1 = split(axis = var_3846_axis_0, split_sizes = var_3846_split_sizes_0, x = normed_209_cast_fp16)[name = string("op_3846")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_83 = mul(x = var_3846_0, y = layers_8_post_feedforward_layernorm_weight)[name = string("hidden_states_83")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_85_cast_fp16 = add(x = x_137_cast_fp16, y = hidden_states_83)[name = string("hidden_states_85_cast_fp16")];
+            tensor<int32, [3]> per_layer_slice_begin_0 = const()[name = string("per_layer_slice_begin_0"), val = tensor<int32, [3]>([0, 0, 10496])];
+            tensor<int32, [3]> per_layer_slice_end_0 = const()[name = string("per_layer_slice_end_0"), val = tensor<int32, [3]>([1, 3, 1])];
+            tensor<bool, [3]> per_layer_slice_end_mask_0 = const()[name = string("per_layer_slice_end_mask_0"), val = tensor<bool, [3]>([true, true, true])];
+            tensor<fp16, [1, 3, 256]> per_layer_slice_cast_fp16 = slice_by_index(begin = per_layer_slice_begin_0, end = per_layer_slice_end_0, end_mask = per_layer_slice_end_mask_0, x = per_layer_combined)[name = string("per_layer_slice_cast_fp16")];
+            tensor<int32, [3]> var_3874 = const()[name = string("op_3874"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> input_211_axes_0 = const()[name = string("input_211_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3875 = transpose(perm = var_3874, x = hidden_states_85_cast_fp16)[name = string("transpose_4")];
+            tensor<fp16, [1, 2560, 1, 3]> input_211 = expand_dims(axes = input_211_axes_0, x = var_3875)[name = string("input_211")];
+            string gated_49_pad_type_0 = const()[name = string("gated_49_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_49_strides_0 = const()[name = string("gated_49_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_49_pad_0 = const()[name = string("gated_49_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_49_dilations_0 = const()[name = string("gated_49_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_49_groups_0 = const()[name = string("gated_49_groups_0"), val = int32(1)];
+            tensor<fp16, [1, 256, 1, 3]> gated_49 = conv(dilations = gated_49_dilations_0, groups = gated_49_groups_0, pad = gated_49_pad_0, pad_type = gated_49_pad_type_0, strides = gated_49_strides_0, weight = layers_8_per_layer_input_gate_weight_palettized, x = input_211)[name = string("gated_49")];
+            string gated_51_mode_0 = const()[name = string("gated_51_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 256, 1, 3]> gated_51 = gelu(mode = gated_51_mode_0, x = gated_49)[name = string("gated_51")];
+            tensor<int32, [3]> var_3894 = const()[name = string("op_3894"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<int32, [1]> per_layer_slice_conv_axes_0 = const()[name = string("per_layer_slice_conv_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 256, 3]> var_3895_cast_fp16 = transpose(perm = var_3894, x = per_layer_slice_cast_fp16)[name = string("transpose_3")];
+            tensor<fp16, [1, 256, 1, 3]> per_layer_slice_conv_cast_fp16 = expand_dims(axes = per_layer_slice_conv_axes_0, x = var_3895_cast_fp16)[name = string("per_layer_slice_conv_cast_fp16")];
+            tensor<fp16, [1, 256, 1, 3]> input_213_cast_fp16 = mul(x = gated_51, y = per_layer_slice_conv_cast_fp16)[name = string("input_213_cast_fp16")];
+            string gated_pad_type_0 = const()[name = string("gated_pad_type_0"), val = string("valid")];
+            tensor<int32, [2]> gated_strides_0 = const()[name = string("gated_strides_0"), val = tensor<int32, [2]>([1, 1])];
+            tensor<int32, [4]> gated_pad_0 = const()[name = string("gated_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [2]> gated_dilations_0 = const()[name = string("gated_dilations_0"), val = tensor<int32, [2]>([1, 1])];
+            int32 gated_groups_0 = const()[name = string("gated_groups_0"), val = int32(1)];
+            tensor<fp16, [2560, 256, 1, 1]> layers_8_per_layer_projection_weight_promoted_to_fp16_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [2560, 256, 1, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417650112))), lut = tensor<fp16, [80, 1, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417977856))))[name = string("layers_8_per_layer_projection_weight_promoted_to_fp16_palettized")];
+            tensor<fp16, [1, 2560, 1, 3]> gated_cast_fp16 = conv(dilations = gated_dilations_0, groups = gated_groups_0, pad = gated_pad_0, pad_type = gated_pad_type_0, strides = gated_strides_0, weight = layers_8_per_layer_projection_weight_promoted_to_fp16_palettized, x = input_213_cast_fp16)[name = string("gated_cast_fp16")];
+            tensor<int32, [1]> var_3911_axes_0 = const()[name = string("op_3911_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2560, 3]> var_3911_cast_fp16 = squeeze(axes = var_3911_axes_0, x = gated_cast_fp16)[name = string("op_3911_cast_fp16")];
+            tensor<int32, [3]> var_3915 = const()[name = string("op_3915"), val = tensor<int32, [3]>([0, 2, 1])];
+            int32 var_3921 = const()[name = string("op_3921"), val = int32(-1)];
+            fp16 const_70_promoted_to_fp16 = const()[name = string("const_70_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> x_141_cast_fp16 = transpose(perm = var_3915, x = var_3911_cast_fp16)[name = string("transpose_2")];
+            tensor<fp16, [1, 3, 2560]> var_3923_cast_fp16 = mul(x = x_141_cast_fp16, y = const_70_promoted_to_fp16)[name = string("op_3923_cast_fp16")];
+            bool input_215_interleave_0 = const()[name = string("input_215_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_215_cast_fp16 = concat(axis = var_3921, interleave = input_215_interleave_0, values = (x_141_cast_fp16, var_3923_cast_fp16))[name = string("input_215_cast_fp16")];
+            tensor<int32, [1]> normed_213_axes_0 = const()[name = string("normed_213_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3918_to_fp16 = const()[name = string("op_3918_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_213_cast_fp16 = layer_norm(axes = normed_213_axes_0, epsilon = var_3918_to_fp16, x = input_215_cast_fp16)[name = string("normed_213_cast_fp16")];
+            tensor<int32, [2]> var_3928_split_sizes_0 = const()[name = string("op_3928_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3928_axis_0 = const()[name = string("op_3928_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3928_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3928_cast_fp16_1 = split(axis = var_3928_axis_0, split_sizes = var_3928_split_sizes_0, x = normed_213_cast_fp16)[name = string("op_3928_cast_fp16")];
+            tensor<fp16, [2560]> layers_8_post_per_layer_input_norm_weight_promoted_to_fp16 = const()[name = string("layers_8_post_per_layer_input_norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417980480)))];
+            tensor<fp16, [1, 3, 2560]> hidden_states_89_cast_fp16 = mul(x = var_3928_cast_fp16_0, y = layers_8_post_per_layer_input_norm_weight_promoted_to_fp16)[name = string("hidden_states_89_cast_fp16")];
+            tensor<fp16, [1, 3, 2560]> hidden_states_cast_fp16 = add(x = hidden_states_85_cast_fp16, y = hidden_states_89_cast_fp16)[name = string("hidden_states_cast_fp16")];
+            tensor<fp16, [1]> const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = tensor<fp16, [1]>([0x1.c8p-2])];
+            tensor<fp16, [1, 3, 2560]> hidden_states_out = mul(x = hidden_states_cast_fp16, y = const_71_promoted_to_fp16)[name = string("x_cast_fp16")];
+            int32 var_3943 = const()[name = string("op_3943"), val = int32(-1)];
+            fp16 const_72_promoted_to_fp16 = const()[name = string("const_72_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 3, 2560]> var_3945_cast_fp16 = mul(x = hidden_states_out, y = const_72_promoted_to_fp16)[name = string("op_3945_cast_fp16")];
+            bool input_217_interleave_0 = const()[name = string("input_217_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 3, 5120]> input_217_cast_fp16 = concat(axis = var_3943, interleave = input_217_interleave_0, values = (hidden_states_out, var_3945_cast_fp16))[name = string("input_217_cast_fp16")];
+            tensor<int32, [1]> normed_217_axes_0 = const()[name = string("normed_217_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_3940_to_fp16 = const()[name = string("op_3940_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 3, 5120]> normed_217_cast_fp16 = layer_norm(axes = normed_217_axes_0, epsilon = var_3940_to_fp16, x = input_217_cast_fp16)[name = string("normed_217_cast_fp16")];
+            tensor<int32, [2]> var_3950_split_sizes_0 = const()[name = string("op_3950_split_sizes_0"), val = tensor<int32, [2]>([2560, 2560])];
+            int32 var_3950_axis_0 = const()[name = string("op_3950_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 3, 2560]> var_3950_cast_fp16_0, tensor<fp16, [1, 3, 2560]> var_3950_cast_fp16_1 = split(axis = var_3950_axis_0, split_sizes = var_3950_split_sizes_0, x = normed_217_cast_fp16)[name = string("op_3950_cast_fp16")];
+            tensor<fp16, [2560]> norm_weight_promoted_to_fp16 = const()[name = string("norm_weight_promoted_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417985664)))];
+            tensor<fp16, [1, 3, 2560]> normed_221_cast_fp16 = mul(x = var_3950_cast_fp16_0, y = norm_weight_promoted_to_fp16)[name = string("normed_221_cast_fp16")];
+            tensor<int32, [3]> var_3961 = const()[name = string("op_3961"), val = tensor<int32, [3]>([0, 2, 1])];
+            tensor<fp16, [262144, 2560, 1]> squeeze_9_palettized = constexpr_lut_to_dense(indices = tensor<uint4, [262144, 2560, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(417990848))), lut = tensor<fp16, [8192, 1, 1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(753535232))))[name = string("squeeze_9_palettized")];
+            string var_3977_pad_type_0 = const()[name = string("op_3977_pad_type_0"), val = string("valid")];
+            int32 var_3977_groups_0 = const()[name = string("op_3977_groups_0"), val = int32(1)];
+            tensor<int32, [1]> var_3977_strides_0 = const()[name = string("op_3977_strides_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [2]> var_3977_pad_0 = const()[name = string("op_3977_pad_0"), val = tensor<int32, [2]>([0, 0])];
+            tensor<int32, [1]> var_3977_dilations_0 = const()[name = string("op_3977_dilations_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 2560, 3]> var_3962 = transpose(perm = var_3961, x = normed_221_cast_fp16)[name = string("transpose_1")];
+            tensor<fp16, [1, 262144, 3]> var_3977 = conv(dilations = var_3977_dilations_0, groups = var_3977_groups_0, pad = var_3977_pad_0, pad_type = var_3977_pad_type_0, strides = var_3977_strides_0, weight = squeeze_9_palettized, x = var_3962)[name = string("op_3977")];
+            tensor<int32, [3]> var_3981 = const()[name = string("op_3981"), val = tensor<int32, [3]>([0, 2, 1])];
+            fp16 _inversed_3984_y_0_to_fp16 = const()[name = string("_inversed_3984_y_0_to_fp16"), val = fp16(0x1.11p-5)];
+            tensor<fp16, [1, 3, 262144]> logits_1 = transpose(perm = var_3981, x = var_3977)[name = string("transpose_0")];
+            tensor<fp16, [1, 3, 262144]> _inversed_3984_cast_fp16 = mul(x = logits_1, y = _inversed_3984_y_0_to_fp16)[name = string("_inversed_3984_cast_fp16")];
+            tensor<fp16, [1, 3, 262144]> var_3985_cast_fp16 = tanh(x = _inversed_3984_cast_fp16)[name = string("op_3985_cast_fp16")];
+            fp16 var_3986_to_fp16 = const()[name = string("op_3986_to_fp16"), val = fp16(0x1.ep+4)];
+            tensor<fp16, [1, 3, 262144]> logits_cast_fp16 = mul(x = var_3985_cast_fp16, y = var_3986_to_fp16)[name = string("logits_cast_fp16")];
+            int32 var_3990_axis_0 = const()[name = string("op_3990_axis_0"), val = int32(-1)];
+            bool var_3990_keep_dims_0 = const()[name = string("op_3990_keep_dims_0"), val = bool(false)];
+            string var_3990_output_dtype_0 = const()[name = string("op_3990_output_dtype_0"), val = string("int32")];
+            tensor<int32, [1, 3]> token_ids = reduce_argmax(axis = var_3990_axis_0, keep_dims = var_3990_keep_dims_0, output_dtype = var_3990_output_dtype_0, x = logits_cast_fp16)[name = string("op_3990_cast_fp16")];
+        } -> (token_ids, hidden_states_out);
+}
\ No newline at end of file
diff --git a/chunk4.mlmodelc/weights/weight.bin b/chunk4.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..adcac95f1a12061f4deb5d464d08365bc361a40d
--- /dev/null
+++ b/chunk4.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f044d109750fec5d781baada3c070cd3d524b674ea68a00c1c99b5e8015cfbb
+size 753797440
diff --git a/cos_full.npy b/cos_full.npy
new file mode 100644
index 0000000000000000000000000000000000000000..058ad65d525c836881e2aa531644304f1a3bafdb
--- /dev/null
+++ b/cos_full.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:767b3a21305a67e3a3fd22e256f2e7385014b32374442b6103fb820c7d9ef1fc
+size 4194432
diff --git a/cos_sliding.npy b/cos_sliding.npy
new file mode 100644
index 0000000000000000000000000000000000000000..81b6cd936a9988f2372f611d346b9efc6056811b
--- /dev/null
+++ b/cos_sliding.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a27afac2d0282008c59736cb498f0b49e6f775e0b3847811fdd06be09c6df4a1
+size 2097280
diff --git a/embed_proj_weight.npy b/embed_proj_weight.npy
new file mode 100644
index 0000000000000000000000000000000000000000..9db21521929994c12b8541a2c09340b5f5b04ff9
--- /dev/null
+++ b/embed_proj_weight.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cae010a79030ee666136bf5317af76019a87a686bc93947ca1e20535f4a9109
+size 7864448
diff --git a/embed_tokens_per_layer_q8.bin b/embed_tokens_per_layer_q8.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5057e698d18cb2404767c1285cbefbec61dfc2fc
--- /dev/null
+++ b/embed_tokens_per_layer_q8.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:269eb54aa366e2d73474d7521b376025fa53bd2d72c6d3bc0301b0882c6ae681
+size 2818572288
diff --git a/embed_tokens_per_layer_scales.bin b/embed_tokens_per_layer_scales.bin
new file mode 100644
index 0000000000000000000000000000000000000000..104df79461e2090edb9c517f9210d6928120bf40
--- /dev/null
+++ b/embed_tokens_per_layer_scales.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc32925fb678b96d96cf804b77c1d137f157a6136b76e6f8a003d69f4e976fea
+size 524288
diff --git a/embed_tokens_q8.bin b/embed_tokens_q8.bin
new file mode 100644
index 0000000000000000000000000000000000000000..87966ac230eb189839f50f90764b79b584c7e24a
--- /dev/null
+++ b/embed_tokens_q8.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8921fcbb6d8a79c7b304b929f357cdb41905fec75a90bad00dfed071c76fb82
+size 671088640
diff --git a/embed_tokens_scales.bin b/embed_tokens_scales.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5da6d354c44eb2197c2445ecd48a51ce866b6c95
--- /dev/null
+++ b/embed_tokens_scales.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5c20211cbfbb5e5a1059d91c5c1e0493e7630505028459e7843a9b1b41ee854
+size 524288
diff --git a/hf_model/config.json b/hf_model/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..d68960fdcce766f2bfe41436325a8a483a74d125
--- /dev/null
+++ b/hf_model/config.json
@@ -0,0 +1,197 @@
+{
+  "architectures": [
+    "Gemma4ForConditionalGeneration"
+  ],
+  "audio_config": {
+    "_name_or_path": "",
+    "architectures": null,
+    "attention_chunk_size": 12,
+    "attention_context_left": 13,
+    "attention_context_right": 0,
+    "attention_invalid_logits_value": -1000000000.0,
+    "attention_logit_cap": 50.0,
+    "chunk_size_feed_forward": 0,
+    "conv_kernel_size": 5,
+    "dtype": "bfloat16",
+    "gradient_clipping": 10000000000.0,
+    "hidden_act": "silu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "model_type": "gemma4_audio",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 12,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_proj_dims": 1536,
+    "problem_type": null,
+    "residual_weight": 0.5,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "subsampling_conv_channels": [
+      128,
+      32
+    ],
+    "use_clipped_linears": true
+  },
+  "audio_token_id": 258881,
+  "boa_token_id": 256000,
+  "boi_token_id": 255999,
+  "dtype": "bfloat16",
+  "eoa_token_id": 258883,
+  "eoa_token_index": 258883,
+  "eoi_token_id": 258882,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "image_token_id": 258880,
+  "initializer_range": 0.02,
+  "model_type": "gemma4",
+  "text_config": {
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "attention_k_eq_v": false,
+    "bos_token_id": 2,
+    "dtype": "bfloat16",
+    "enable_moe_block": false,
+    "eos_token_id": 1,
+    "expert_intermediate_size": null,
+    "final_logit_softcapping": 30.0,
+    "global_head_dim": 512,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2560,
+    "hidden_size_per_layer_input": 256,
+    "initializer_range": 0.02,
+    "intermediate_size": 10240,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gemma4_text",
+    "num_attention_heads": 8,
+    "num_experts": null,
+    "num_global_key_value_heads": null,
+    "num_hidden_layers": 42,
+    "num_key_value_heads": 2,
+    "num_kv_shared_layers": 18,
+    "pad_token_id": 0,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "full_attention": {
+        "partial_rotary_factor": 0.25,
+        "rope_theta": 1000000.0,
+        "rope_type": "proportional"
+      },
+      "sliding_attention": {
+        "rope_theta": 10000.0,
+        "rope_type": "default"
+      }
+    },
+    "sliding_window": 512,
+    "tie_word_embeddings": true,
+    "top_k_experts": null,
+    "use_bidirectional_attention": null,
+    "use_cache": true,
+    "use_double_wide_mlp": false,
+    "vocab_size": 262144,
+    "vocab_size_per_layer_input": 262144
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "5.5.0.dev0",
+  "video_token_id": 258884,
+  "vision_config": {
+    "_name_or_path": "",
+    "architectures": null,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "chunk_size_feed_forward": 0,
+    "default_output_length": 280,
+    "dtype": "bfloat16",
+    "global_head_dim": 64,
+    "head_dim": 64,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "max_position_embeddings": 131072,
+    "model_type": "gemma4_vision",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 16,
+    "num_key_value_heads": 12,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "patch_size": 16,
+    "pooling_kernel_size": 3,
+    "position_embedding_size": 10240,
+    "problem_type": null,
+    "return_dict": true,
+    "rms_norm_eps": 1e-06,
+    "rope_parameters": {
+      "rope_theta": 100.0,
+      "rope_type": "default"
+    },
+    "standardize": false,
+    "use_clipped_linears": true
+  },
+  "vision_soft_tokens_per_image": 280
+}
diff --git a/hf_model/generation_config.json b/hf_model/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..e605bb4523b1462ea9d9a3810b9e3ecf7ab7b1f6
--- /dev/null
+++ b/hf_model/generation_config.json
@@ -0,0 +1,14 @@
+{
+  "bos_token_id": 2,
+  "do_sample": true,
+  "eos_token_id": [
+    1,
+    106,
+    50
+  ],
+  "pad_token_id": 0,
+  "temperature": 1.0,
+  "top_k": 64,
+  "top_p": 0.95,
+  "transformers_version": "5.5.0.dev0"
+}
diff --git a/hf_model/tokenizer.json b/hf_model/tokenizer.json
new file mode 100644
index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503
--- /dev/null
+++ b/hf_model/tokenizer.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
+size 32169626
diff --git a/hf_model/tokenizer_config.json b/hf_model/tokenizer_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..375b25dc8be85705251e41be1c25310d24932051
--- /dev/null
+++ b/hf_model/tokenizer_config.json
@@ -0,0 +1,74 @@
+{
+  "audio_token": "<|audio|>",
+  "backend": "tokenizers",
+  "boa_token": "<|audio>",
+  "boi_token": "<|image>",
+  "bos_token": "<bos>",
+  "eoa_token": "<audio|>",
+  "eoc_token": "<channel|>",
+  "eoi_token": "<image|>",
+  "eos_token": "<eos>",
+  "eot_token": "<turn|>",
+  "escape_token": "<|\"|>",
+  "etc_token": "<tool_call|>",
+  "etd_token": "<tool|>",
+  "etr_token": "<tool_response|>",
+  "extra_special_tokens": [
+    "<|video|>"
+  ],
+  "image_token": "<|image|>",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "padding_side": "left",
+  "processor_class": "Gemma4Processor",
+  "response_schema": {
+    "type": "object",
+    "properties": {
+      "role": {
+        "const": "assistant"
+      },
+      "thinking": {
+        "type": "string"
+      },
+      "content": {
+        "type": "string"
+      },
+      "tool_calls": {
+        "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>",
+        "type": "array",
+        "items": {
+          "type": "object",
+          "properties": {
+            "type": {
+              "const": "function"
+            },
+            "function": {
+              "type": "object",
+              "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})",
+              "properties": {
+                "name": {
+                  "type": "string"
+                },
+                "arguments": {
+                  "type": "object",
+                  "x-parser": "gemma4-tool-call",
+                  "additionalProperties": {}
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
+  },
+  "soc_token": "<|channel>",
+  "sot_token": "<|turn>",
+  "stc_token": "<|tool_call>",
+  "std_token": "<|tool>",
+  "str_token": "<|tool_response>",
+  "think_token": "<|think|>",
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>"
+}
diff --git a/mel_filterbank.bin b/mel_filterbank.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ea2424596423068bc05fa92aba129c762f76cb8e
--- /dev/null
+++ b/mel_filterbank.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:427860b9a9429175f0e450512def4224f46ced89960dfb1d9cf7479d7e485e2b
+size 131584
diff --git a/model_config.json b/model_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef5ff6c86cbdc5bee360796fd8530719038e2f52
--- /dev/null
+++ b/model_config.json
@@ -0,0 +1,26 @@
+{
+  "model_name": "gemma4-e4b",
+  "architecture": "gemma4",
+  "hidden_size": 2560,
+  "num_hidden_layers": 42,
+  "num_attention_heads": 8,
+  "num_key_value_heads": 2,
+  "head_dim": 256,
+  "global_head_dim": 512,
+  "vocab_size": 262144,
+  "context_length": 2048,
+  "sliding_window": 512,
+  "per_layer_dim": 256,
+  "num_layers": 42,
+  "embed_scale": 50.59644256269407,
+  "per_layer_embed_scale": 16.0,
+  "per_layer_model_projection_scale": 0.01976423537605237,
+  "per_layer_input_scale": 0.7071067811865475,
+  "rms_norm_eps": 1e-06,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "final_logit_softcapping": 30.0,
+  "quantization": "int4",
+  "compute_units": "CPU_AND_NE",
+  "tokenizer_repo": "google/gemma-4-E4B-it"
+}
\ No newline at end of file
diff --git a/output_proj_bias.npy b/output_proj_bias.npy
new file mode 100644
index 0000000000000000000000000000000000000000..839b7c9c54c0777bbbd8f9a50523223b269150af
--- /dev/null
+++ b/output_proj_bias.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5f564508f25bb143480d3dd9071c5deb8340f7c9b35582fefd55815ab355597
+size 3200
diff --git a/output_proj_weight.npy b/output_proj_weight.npy
new file mode 100644
index 0000000000000000000000000000000000000000..d0a509337452653891a974a7a612a66fd0bffa69
--- /dev/null
+++ b/output_proj_weight.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:074449599eed804bc8fe0e83e96d535109c1f9f9a3c51df7c3a232d67502e285
+size 3145856
diff --git a/per_layer_norm_weight.bin b/per_layer_norm_weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..2c2e296eb70aa8677008db33c3db01be0e62c139
--- /dev/null
+++ b/per_layer_norm_weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff67a6ed2e1ac597c28467653c4d67ecd4018f668b1d667af95e564539bd4c10
+size 512
diff --git a/per_layer_projection.bin b/per_layer_projection.bin
new file mode 100644
index 0000000000000000000000000000000000000000..66e5e2170c01ffefd98d3404b48d49d3fb4380fd
--- /dev/null
+++ b/per_layer_projection.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8889e468a6c7e0a43f6ea4a7df18435cf75a5e496415146790f9e0a8a9cd63f
+size 55050240
diff --git a/sin_full.npy b/sin_full.npy
new file mode 100644
index 0000000000000000000000000000000000000000..1d5eb6ea4929fefb9bb3d459c2224d6772506c2c
--- /dev/null
+++ b/sin_full.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa9d6aa40ca6b4f02f2be7563be801bc8ed77bcbc06f3aeb46050587401f2b4e
+size 4194432
diff --git a/sin_sliding.npy b/sin_sliding.npy
new file mode 100644
index 0000000000000000000000000000000000000000..30a46c37c40dea4cffd340d215d0253a8d8bb854
--- /dev/null
+++ b/sin_sliding.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8a996e1172dbb9748de65e1b7cede880e07f89310719215c627cf26faebcd55
+size 2097280
diff --git a/vision.ane.mlmodelc/analytics/coremldata.bin b/vision.ane.mlmodelc/analytics/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..6c62398ed1dc87044069208da609bd1d9e672628
--- /dev/null
+++ b/vision.ane.mlmodelc/analytics/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcfbfea0ab4e72567e783811aad1d436664ec9c6ecc2b1004406cab6a27b3b4b
+size 243
diff --git a/vision.ane.mlmodelc/coremldata.bin b/vision.ane.mlmodelc/coremldata.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d72155fb99f7cbb8da9bcd7de526a2e793e67163
--- /dev/null
+++ b/vision.ane.mlmodelc/coremldata.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67e5aeccbc809b1a933d9859e140e927272469e6714736580382e3b807d92d91
+size 426
diff --git a/vision.ane.mlmodelc/metadata.json b/vision.ane.mlmodelc/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..1795f749c3737fb8dad0a4112f5113acd82b78d3
--- /dev/null
+++ b/vision.ane.mlmodelc/metadata.json
@@ -0,0 +1,100 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 256 × 2560)",
+        "shortDescription" : "",
+        "shape" : "[1, 256, 2560]",
+        "name" : "image_features",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 9,
+    "mlProgramOperationTypeHistogram" : {
+      "Ios18.expandDims" : 134,
+      "Ios18.mul" : 423,
+      "Ios18.cos" : 2,
+      "Ios18.softmax" : 16,
+      "Ios18.matmul" : 36,
+      "Ios18.floorDiv" : 2,
+      "Ios16.reduceMean" : 112,
+      "Ios18.logicalNot" : 1,
+      "Ios18.equal" : 1,
+      "Ios18.sin" : 2,
+      "Split" : 97,
+      "Select" : 2,
+      "Ios16.reduceMax" : 1,
+      "Ios16.reduceMin" : 1,
+      "Ios18.add" : 227,
+      "Ios16.reduceSum" : 1,
+      "Tile" : 1,
+      "Ios18.layerNorm" : 1,
+      "Ios18.reshape" : 64,
+      "Ios18.maximum" : 1,
+      "Ios18.linear" : 114,
+      "Ios18.concat" : 101,
+      "Ios18.transpose" : 67,
+      "OneHot" : 2,
+      "Ios18.sub" : 2,
+      "Ios18.cast" : 10,
+      "Ios18.pow" : 224,
+      "Ios18.clip" : 176,
+      "Ios18.gelu" : 16,
+      "Ios18.sliceByIndex" : 133
+    },
+    "computePrecision" : "Mixed (Float16, Int32)",
+    "isUpdatable" : "0",
+    "stateSchema" : [
+
+    ],
+    "availability" : {
+      "macOS" : "15.0",
+      "tvOS" : "18.0",
+      "visionOS" : "2.0",
+      "watchOS" : "11.0",
+      "iOS" : "18.0",
+      "macCatalyst" : "18.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+      "com.github.apple.coremltools.conversion_date" : "2026-04-30",
+      "com.github.apple.coremltools.source" : "torch==2.11.0",
+      "com.github.apple.coremltools.version" : "9.0",
+      "com.github.apple.coremltools.source_dialect" : "TorchScript"
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float16",
+        "formattedType" : "MultiArray (Float16 1 × 2304 × 768)",
+        "shortDescription" : "",
+        "shape" : "[1, 2304, 768]",
+        "name" : "pixel_values",
+        "type" : "MultiArray"
+      },
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Int32",
+        "formattedType" : "MultiArray (Int32 1 × 2304 × 2)",
+        "shortDescription" : "",
+        "shape" : "[1, 2304, 2]",
+        "name" : "pixel_position_ids",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "vision_ane",
+    "method" : "predict"
+  }
+]
\ No newline at end of file
diff --git a/vision.ane.mlmodelc/model.mil b/vision.ane.mlmodelc/model.mil
new file mode 100644
index 0000000000000000000000000000000000000000..a91c0e99dc32ab695a7a8a424680506a9512d479
--- /dev/null
+++ b/vision.ane.mlmodelc/model.mil
@@ -0,0 +1,4167 @@
+program(1.3)
+[buildInfo = dict<string, string>({{"coremlc-component-MIL", "3500.14.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.11.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "9.0"}})]
+{
+    func main<ios18>(tensor<int32, [1, 2304, 2]> pixel_position_ids, tensor<fp16, [1, 2304, 768]> pixel_values) {
+            tensor<fp16, [2, 10240, 768]> model_vision_tower_patch_embedder_position_embedding_table = const()[name = string("model_vision_tower_patch_embedder_position_embedding_table"), val = tensor<fp16, [2, 10240, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64)))];
+            tensor<fp16, [768, 768]> model_vision_tower_patch_embedder_input_proj_weight = const()[name = string("model_vision_tower_patch_embedder_input_proj_weight"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(31457408)))];
+            int32 var_17 = const()[name = string("op_17"), val = int32(3)];
+            int32 var_31 = const()[name = string("op_31"), val = int32(0)];
+            int32 var_38 = const()[name = string("op_38"), val = int32(-1)];
+            tensor<bool, [1, 2304, 2]> var_47 = equal(x = pixel_position_ids, y = var_38)[name = string("op_47")];
+            tensor<int32, [1]> var_38_list = const()[name = string("op_38_list"), val = tensor<int32, [1]>([-1])];
+            string cast_1_dtype_0 = const()[name = string("cast_1_dtype_0"), val = string("int32")];
+            bool reduce_min_0_keep_dims_0 = const()[name = string("reduce_min_0_keep_dims_0"), val = bool(false)];
+            tensor<int32, [1, 2304, 2]> cast_1 = cast(dtype = cast_1_dtype_0, x = var_47)[name = string("cast_80")];
+            tensor<int32, [1, 2304]> reduce_min_0 = reduce_min(axes = var_38_list, keep_dims = reduce_min_0_keep_dims_0, x = cast_1)[name = string("reduce_min_0")];
+            string padding_positions_dtype_0 = const()[name = string("padding_positions_dtype_0"), val = string("bool")];
+            fp16 var_51_to_fp16 = const()[name = string("op_51_to_fp16"), val = fp16(0x1p-1)];
+            tensor<fp16, [1, 2304, 768]> var_52_cast_fp16 = sub(x = pixel_values, y = var_51_to_fp16)[name = string("op_52_cast_fp16")];
+            fp16 var_53_promoted_to_fp16 = const()[name = string("op_53_promoted_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> pixel_values_cast_fp16 = mul(x = var_52_cast_fp16, y = var_53_promoted_to_fp16)[name = string("pixel_values_cast_fp16")];
+            tensor<fp16, [768]> linear_0_bias_0 = const()[name = string("linear_0_bias_0"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32637120)))];
+            tensor<fp16, [1, 2304, 768]> hidden_states_1 = linear(bias = linear_0_bias_0, weight = model_vision_tower_patch_embedder_input_proj_weight, x = pixel_values_cast_fp16)[name = string("linear_0")];
+            tensor<int32, [1, 2304, 2]> clamped_positions_1 = maximum(x = pixel_position_ids, y = var_31)[name = string("clamped_positions_1")];
+            int32 one_hot_1_one_hot_vector_size_0 = const()[name = string("one_hot_1_one_hot_vector_size_0"), val = int32(10240)];
+            int32 one_hot_1_axis_0 = const()[name = string("one_hot_1_axis_0"), val = int32(-1)];
+            int32 one_hot_1_on_value_0 = const()[name = string("one_hot_1_on_value_0"), val = int32(1)];
+            int32 one_hot_1_off_value_0 = const()[name = string("one_hot_1_off_value_0"), val = int32(0)];
+            tensor<int32, [1, 2304, 2, 10240]> one_hot_1 = one_hot(axis = one_hot_1_axis_0, indices = clamped_positions_1, off_value = one_hot_1_off_value_0, on_value = one_hot_1_on_value_0, one_hot_vector_size = one_hot_1_one_hot_vector_size_0)[name = string("one_hot_1")];
+            tensor<int32, [4]> var_60 = const()[name = string("op_60"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            string one_hot_dtype_0 = const()[name = string("one_hot_dtype_0"), val = string("fp16")];
+            bool position_embeddings_1_transpose_x_0 = const()[name = string("position_embeddings_1_transpose_x_0"), val = bool(false)];
+            bool position_embeddings_1_transpose_y_0 = const()[name = string("position_embeddings_1_transpose_y_0"), val = bool(false)];
+            tensor<int32, [1, 2, 2304, 10240]> var_61 = transpose(perm = var_60, x = one_hot_1)[name = string("transpose_162")];
+            tensor<fp16, [1, 2, 2304, 10240]> one_hot = cast(dtype = one_hot_dtype_0, x = var_61)[name = string("cast_78")];
+            tensor<fp16, [1, 2, 2304, 768]> position_embeddings_1 = matmul(transpose_x = position_embeddings_1_transpose_x_0, transpose_y = position_embeddings_1_transpose_y_0, x = one_hot, y = model_vision_tower_patch_embedder_position_embedding_table)[name = string("position_embeddings_1")];
+            tensor<int32, [1]> position_embeddings_3_axes_0 = const()[name = string("position_embeddings_3_axes_0"), val = tensor<int32, [1]>([1])];
+            bool position_embeddings_3_keep_dims_0 = const()[name = string("position_embeddings_3_keep_dims_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 768]> position_embeddings_3 = reduce_sum(axes = position_embeddings_3_axes_0, keep_dims = position_embeddings_3_keep_dims_0, x = position_embeddings_1)[name = string("position_embeddings_3")];
+            tensor<int32, [1]> var_66_axes_0 = const()[name = string("op_66_axes_0"), val = tensor<int32, [1]>([-1])];
+            tensor<bool, [1, 2304]> padding_positions = cast(dtype = padding_positions_dtype_0, x = reduce_min_0)[name = string("cast_79")];
+            tensor<bool, [1, 2304, 1]> var_66 = expand_dims(axes = var_66_axes_0, x = padding_positions)[name = string("op_66")];
+            tensor<int32, [3]> var_66_after_broadcast_reps_0 = const()[name = string("op_66_after_broadcast_reps_0"), val = tensor<int32, [3]>([1, 1, 768])];
+            tensor<bool, [1, 2304, 768]> var_66_after_broadcast = tile(reps = var_66_after_broadcast_reps_0, x = var_66)[name = string("op_66_after_broadcast")];
+            tensor<fp16, [1, 2304, 768]> var_36_after_broadcast_to_fp16 = const()[name = string("op_36_after_broadcast_to_fp16"), val = tensor<fp16, [1, 2304, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(32638720)))];
+            tensor<fp16, [1, 2304, 768]> position_embeddings_cast_fp16 = select(a = var_36_after_broadcast_to_fp16, b = position_embeddings_3, cond = var_66_after_broadcast)[name = string("position_embeddings_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_3_cast_fp16 = add(x = hidden_states_1, y = position_embeddings_cast_fp16)[name = string("hidden_states_3_cast_fp16")];
+            tensor<bool, [1, 2304]> attention_mask_1 = logical_not(x = padding_positions)[name = string("attention_mask_1")];
+            string am_dtype_0 = const()[name = string("am_dtype_0"), val = string("fp16")];
+            tensor<int32, [1]> var_104_axes_0 = const()[name = string("op_104_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<fp16, [1, 2304]> am = cast(dtype = am_dtype_0, x = attention_mask_1)[name = string("cast_77")];
+            tensor<fp16, [1, 1, 2304]> var_104 = expand_dims(axes = var_104_axes_0, x = am)[name = string("op_104")];
+            tensor<int32, [1]> row_axes_0 = const()[name = string("row_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 1, 1, 2304]> row = expand_dims(axes = row_axes_0, x = var_104)[name = string("row")];
+            tensor<int32, [1]> col_axes_0 = const()[name = string("col_axes_0"), val = tensor<int32, [1]>([3])];
+            tensor<fp16, [1, 1, 2304, 1]> col = expand_dims(axes = col_axes_0, x = var_104)[name = string("col")];
+            tensor<fp16, [1, 1, 2304, 2304]> var_108 = mul(x = row, y = col)[name = string("op_108")];
+            fp16 var_21_to_fp16 = const()[name = string("op_21_to_fp16"), val = fp16(0x1p+0)];
+            tensor<fp16, [1, 1, 2304, 2304]> var_109_cast_fp16 = sub(x = var_21_to_fp16, y = var_108)[name = string("op_109_cast_fp16")];
+            fp16 var_110_to_fp16 = const()[name = string("op_110_to_fp16"), val = fp16(-0x1.ffcp+15)];
+            tensor<fp16, [1, 1, 2304, 2304]> attention_mask_cast_fp16 = mul(x = var_109_cast_fp16, y = var_110_to_fp16)[name = string("attention_mask_cast_fp16")];
+            tensor<int32, [3]> dim_position_ids_1_begin_0 = const()[name = string("dim_position_ids_1_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> dim_position_ids_1_end_0 = const()[name = string("dim_position_ids_1_end_0"), val = tensor<int32, [3]>([1, 2304, 1])];
+            tensor<bool, [3]> dim_position_ids_1_end_mask_0 = const()[name = string("dim_position_ids_1_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<bool, [3]> dim_position_ids_1_squeeze_mask_0 = const()[name = string("dim_position_ids_1_squeeze_mask_0"), val = tensor<bool, [3]>([false, false, true])];
+            tensor<int32, [1, 2304]> dim_position_ids_1 = slice_by_index(begin = dim_position_ids_1_begin_0, end = dim_position_ids_1_end_0, end_mask = dim_position_ids_1_end_mask_0, squeeze_mask = dim_position_ids_1_squeeze_mask_0, x = pixel_position_ids)[name = string("dim_position_ids_1")];
+            tensor<int32, [1]> var_125_axes_0 = const()[name = string("op_125_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [1, 1, 2304]> var_125 = expand_dims(axes = var_125_axes_0, x = dim_position_ids_1)[name = string("op_125")];
+            bool var_130_transpose_x_0 = const()[name = string("op_130_transpose_x_0"), val = bool(false)];
+            bool var_130_transpose_y_0 = const()[name = string("op_130_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1]> const_4_to_fp16 = const()[name = string("const_4_to_fp16"), val = tensor<fp16, [1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36177728)))];
+            string dim_position_ids_expanded_1_to_fp16_dtype_0 = const()[name = string("dim_position_ids_expanded_1_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 1, 2304]> var_125_to_fp16 = cast(dtype = dim_position_ids_expanded_1_to_fp16_dtype_0, x = var_125)[name = string("cast_76")];
+            tensor<fp16, [1, 16, 2304]> var_130_cast_fp16 = matmul(transpose_x = var_130_transpose_x_0, transpose_y = var_130_transpose_y_0, x = const_4_to_fp16, y = var_125_to_fp16)[name = string("op_130_cast_fp16")];
+            tensor<int32, [3]> freqs_1_perm_0 = const()[name = string("freqs_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            bool emb_1_interleave_0 = const()[name = string("emb_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 16]> freqs_1_cast_fp16 = transpose(perm = freqs_1_perm_0, x = var_130_cast_fp16)[name = string("transpose_161")];
+            tensor<fp16, [1, 2304, 32]> emb_1_cast_fp16 = concat(axis = var_38, interleave = emb_1_interleave_0, values = (freqs_1_cast_fp16, freqs_1_cast_fp16))[name = string("emb_1_cast_fp16")];
+            tensor<fp16, [1, 2304, 32]> var_134_cast_fp16 = cos(x = emb_1_cast_fp16)[name = string("op_134_cast_fp16")];
+            tensor<fp16, [1, 2304, 32]> var_137_cast_fp16 = sin(x = emb_1_cast_fp16)[name = string("op_137_cast_fp16")];
+            tensor<int32, [3]> dim_position_ids_begin_0 = const()[name = string("dim_position_ids_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
+            tensor<int32, [3]> dim_position_ids_end_0 = const()[name = string("dim_position_ids_end_0"), val = tensor<int32, [3]>([1, 2304, 2])];
+            tensor<bool, [3]> dim_position_ids_end_mask_0 = const()[name = string("dim_position_ids_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<bool, [3]> dim_position_ids_squeeze_mask_0 = const()[name = string("dim_position_ids_squeeze_mask_0"), val = tensor<bool, [3]>([false, false, true])];
+            tensor<int32, [1, 2304]> dim_position_ids = slice_by_index(begin = dim_position_ids_begin_0, end = dim_position_ids_end_0, end_mask = dim_position_ids_end_mask_0, squeeze_mask = dim_position_ids_squeeze_mask_0, x = pixel_position_ids)[name = string("dim_position_ids")];
+            tensor<int32, [1]> var_144_axes_0 = const()[name = string("op_144_axes_0"), val = tensor<int32, [1]>([1])];
+            tensor<int32, [1, 1, 2304]> var_144 = expand_dims(axes = var_144_axes_0, x = dim_position_ids)[name = string("op_144")];
+            bool var_149_transpose_x_0 = const()[name = string("op_149_transpose_x_0"), val = bool(false)];
+            bool var_149_transpose_y_0 = const()[name = string("op_149_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 16, 1]> const_5_to_fp16 = const()[name = string("const_5_to_fp16"), val = tensor<fp16, [1, 16, 1]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36177856)))];
+            string dim_position_ids_expanded_5_to_fp16_dtype_0 = const()[name = string("dim_position_ids_expanded_5_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 1, 2304]> var_144_to_fp16 = cast(dtype = dim_position_ids_expanded_5_to_fp16_dtype_0, x = var_144)[name = string("cast_75")];
+            tensor<fp16, [1, 16, 2304]> var_149_cast_fp16 = matmul(transpose_x = var_149_transpose_x_0, transpose_y = var_149_transpose_y_0, x = const_5_to_fp16, y = var_144_to_fp16)[name = string("op_149_cast_fp16")];
+            tensor<int32, [3]> freqs_perm_0 = const()[name = string("freqs_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
+            bool emb_interleave_0 = const()[name = string("emb_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 16]> freqs_cast_fp16 = transpose(perm = freqs_perm_0, x = var_149_cast_fp16)[name = string("transpose_160")];
+            tensor<fp16, [1, 2304, 32]> emb_cast_fp16 = concat(axis = var_38, interleave = emb_interleave_0, values = (freqs_cast_fp16, freqs_cast_fp16))[name = string("emb_cast_fp16")];
+            tensor<fp16, [1, 2304, 32]> var_153_cast_fp16 = cos(x = emb_cast_fp16)[name = string("op_153_cast_fp16")];
+            tensor<fp16, [1, 2304, 32]> var_156_cast_fp16 = sin(x = emb_cast_fp16)[name = string("op_156_cast_fp16")];
+            bool var_160_interleave_0 = const()[name = string("op_160_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 64]> var_160_cast_fp16 = concat(axis = var_38, interleave = var_160_interleave_0, values = (var_134_cast_fp16, var_153_cast_fp16))[name = string("op_160_cast_fp16")];
+            bool var_163_interleave_0 = const()[name = string("op_163_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 64]> var_163_cast_fp16 = concat(axis = var_38, interleave = var_163_interleave_0, values = (var_137_cast_fp16, var_156_cast_fp16))[name = string("op_163_cast_fp16")];
+            fp16 var_33_promoted_to_fp16 = const()[name = string("op_33_promoted_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_176_cast_fp16 = pow(x = hidden_states_3_cast_fp16, y = var_33_promoted_to_fp16)[name = string("op_176_cast_fp16")];
+            tensor<int32, [1]> var_178_axes_0 = const()[name = string("op_178_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_178_keep_dims_0 = const()[name = string("op_178_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_178_cast_fp16 = reduce_mean(axes = var_178_axes_0, keep_dims = var_178_keep_dims_0, x = var_176_cast_fp16)[name = string("op_178_cast_fp16")];
+            fp16 var_179_to_fp16 = const()[name = string("op_179_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_1_cast_fp16 = add(x = var_178_cast_fp16, y = var_179_to_fp16)[name = string("mean_squared_1_cast_fp16")];
+            fp16 var_27_to_fp16 = const()[name = string("op_27_to_fp16"), val = fp16(-0x1p-1)];
+            tensor<fp16, [1, 2304, 1]> var_181_cast_fp16 = pow(x = mean_squared_1_cast_fp16, y = var_27_to_fp16)[name = string("op_181_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_1_cast_fp16 = mul(x = hidden_states_3_cast_fp16, y = var_181_cast_fp16)[name = string("normed_output_1_cast_fp16")];
+            tensor<fp16, [768]> const_6_to_fp16 = const()[name = string("const_6_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36177984)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_3_cast_fp16 = mul(x = normed_output_1_cast_fp16, y = const_6_to_fp16)[name = string("normed_output_3_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.98p+2)];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.94p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_0_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_3_cast_fp16)[name = string("clip_0_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_0_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(36179584)))];
+            tensor<fp16, [1, 2304, 768]> linear_1_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_0_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_0_cast_fp16)[name = string("linear_1_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6ap+3)];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.66p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_1_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_1_cast_fp16)[name = string("clip_1_cast_fp16")];
+            tensor<int32, [4]> var_203 = const()[name = string("op_203"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_11_cast_fp16 = reshape(shape = var_203, x = clip_1_cast_fp16)[name = string("hidden_states_11_cast_fp16")];
+            fp16 var_33_promoted_1_to_fp16 = const()[name = string("op_33_promoted_1_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_207_cast_fp16 = pow(x = hidden_states_11_cast_fp16, y = var_33_promoted_1_to_fp16)[name = string("op_207_cast_fp16")];
+            tensor<int32, [1]> var_209_axes_0 = const()[name = string("op_209_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_209_keep_dims_0 = const()[name = string("op_209_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_209_cast_fp16 = reduce_mean(axes = var_209_axes_0, keep_dims = var_209_keep_dims_0, x = var_207_cast_fp16)[name = string("op_209_cast_fp16")];
+            fp16 var_210_to_fp16 = const()[name = string("op_210_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_3_cast_fp16 = add(x = var_209_cast_fp16, y = var_210_to_fp16)[name = string("mean_squared_3_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_212_cast_fp16 = pow(x = mean_squared_3_cast_fp16, y = var_27_to_fp16)[name = string("op_212_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_5_cast_fp16 = mul(x = hidden_states_11_cast_fp16, y = var_212_cast_fp16)[name = string("normed_output_5_cast_fp16")];
+            tensor<fp16, [64]> const_9_to_fp16 = const()[name = string("const_9_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37359296)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_7_cast_fp16 = mul(x = normed_output_5_cast_fp16, y = const_9_to_fp16)[name = string("normed_output_7_cast_fp16")];
+            tensor<int32, [2]> var_232 = const()[name = string("op_232"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_233_axis_0 = const()[name = string("op_233_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_233_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_233_cast_fp16_1 = split(axis = var_233_axis_0, split_sizes = var_232, x = normed_output_7_cast_fp16)[name = string("op_233_cast_fp16")];
+            tensor<int32, [2]> var_236 = const()[name = string("op_236"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_237_axis_0 = const()[name = string("op_237_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_237_0, tensor<fp16, [1, 2304, 32]> var_237_1 = split(axis = var_237_axis_0, split_sizes = var_236, x = var_160_cast_fp16)[name = string("op_237")];
+            tensor<int32, [2]> var_240 = const()[name = string("op_240"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_241_axis_0 = const()[name = string("op_241_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_241_0, tensor<fp16, [1, 2304, 32]> var_241_1 = split(axis = var_241_axis_0, split_sizes = var_240, x = var_163_cast_fp16)[name = string("op_241")];
+            tensor<int32, [1]> cos_5_axes_0 = const()[name = string("cos_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_5 = expand_dims(axes = cos_5_axes_0, x = var_237_0)[name = string("cos_5")];
+            tensor<int32, [1]> sin_5_axes_0 = const()[name = string("sin_5_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_5 = expand_dims(axes = sin_5_axes_0, x = var_241_0)[name = string("sin_5")];
+            tensor<fp16, [1, 2304, 12, 32]> var_246_cast_fp16 = mul(x = var_233_cast_fp16_0, y = cos_5)[name = string("op_246_cast_fp16")];
+            tensor<int32, [4]> x1_1_begin_0 = const()[name = string("x1_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_1_end_0 = const()[name = string("x1_1_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_1_end_mask_0 = const()[name = string("x1_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_1_cast_fp16 = slice_by_index(begin = x1_1_begin_0, end = x1_1_end_0, end_mask = x1_1_end_mask_0, x = var_233_cast_fp16_0)[name = string("x1_1_cast_fp16")];
+            tensor<int32, [4]> x2_1_begin_0 = const()[name = string("x2_1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_1_end_0 = const()[name = string("x2_1_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_1_end_mask_0 = const()[name = string("x2_1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_1_cast_fp16 = slice_by_index(begin = x2_1_begin_0, end = x2_1_end_0, end_mask = x2_1_end_mask_0, x = var_233_cast_fp16_0)[name = string("x2_1_cast_fp16")];
+            fp16 const_14_promoted_to_fp16 = const()[name = string("const_14_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_257_cast_fp16 = mul(x = x2_1_cast_fp16, y = const_14_promoted_to_fp16)[name = string("op_257_cast_fp16")];
+            bool var_259_interleave_0 = const()[name = string("op_259_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_259_cast_fp16 = concat(axis = var_38, interleave = var_259_interleave_0, values = (var_257_cast_fp16, x1_1_cast_fp16))[name = string("op_259_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_260_cast_fp16 = mul(x = var_259_cast_fp16, y = sin_5)[name = string("op_260_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_261_cast_fp16 = add(x = var_246_cast_fp16, y = var_260_cast_fp16)[name = string("op_261_cast_fp16")];
+            tensor<int32, [1]> cos_9_axes_0 = const()[name = string("cos_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_9 = expand_dims(axes = cos_9_axes_0, x = var_237_1)[name = string("cos_9")];
+            tensor<int32, [1]> sin_9_axes_0 = const()[name = string("sin_9_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_9 = expand_dims(axes = sin_9_axes_0, x = var_241_1)[name = string("sin_9")];
+            tensor<fp16, [1, 2304, 12, 32]> var_264_cast_fp16 = mul(x = var_233_cast_fp16_1, y = cos_9)[name = string("op_264_cast_fp16")];
+            tensor<int32, [4]> x1_3_begin_0 = const()[name = string("x1_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_3_end_0 = const()[name = string("x1_3_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_3_end_mask_0 = const()[name = string("x1_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_3_cast_fp16 = slice_by_index(begin = x1_3_begin_0, end = x1_3_end_0, end_mask = x1_3_end_mask_0, x = var_233_cast_fp16_1)[name = string("x1_3_cast_fp16")];
+            tensor<int32, [4]> x2_3_begin_0 = const()[name = string("x2_3_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_3_end_0 = const()[name = string("x2_3_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_3_end_mask_0 = const()[name = string("x2_3_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_3_cast_fp16 = slice_by_index(begin = x2_3_begin_0, end = x2_3_end_0, end_mask = x2_3_end_mask_0, x = var_233_cast_fp16_1)[name = string("x2_3_cast_fp16")];
+            fp16 const_17_promoted_to_fp16 = const()[name = string("const_17_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_275_cast_fp16 = mul(x = x2_3_cast_fp16, y = const_17_promoted_to_fp16)[name = string("op_275_cast_fp16")];
+            bool var_277_interleave_0 = const()[name = string("op_277_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_277_cast_fp16 = concat(axis = var_38, interleave = var_277_interleave_0, values = (var_275_cast_fp16, x1_3_cast_fp16))[name = string("op_277_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_278_cast_fp16 = mul(x = var_277_cast_fp16, y = sin_9)[name = string("op_278_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_279_cast_fp16 = add(x = var_264_cast_fp16, y = var_278_cast_fp16)[name = string("op_279_cast_fp16")];
+            bool query_states_1_interleave_0 = const()[name = string("query_states_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_1_cast_fp16 = concat(axis = var_38, interleave = query_states_1_interleave_0, values = (var_261_cast_fp16, var_279_cast_fp16))[name = string("query_states_1_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_0_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(37359488)))];
+            tensor<fp16, [1, 2304, 768]> linear_2_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_0_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_0_cast_fp16)[name = string("linear_2_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.44p+3)];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.42p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_3_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_2_cast_fp16)[name = string("clip_3_cast_fp16")];
+            tensor<int32, [4]> var_292 = const()[name = string("op_292"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_17_cast_fp16 = reshape(shape = var_292, x = clip_3_cast_fp16)[name = string("hidden_states_17_cast_fp16")];
+            fp16 var_33_promoted_2_to_fp16 = const()[name = string("op_33_promoted_2_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_296_cast_fp16 = pow(x = hidden_states_17_cast_fp16, y = var_33_promoted_2_to_fp16)[name = string("op_296_cast_fp16")];
+            tensor<int32, [1]> var_298_axes_0 = const()[name = string("op_298_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_298_keep_dims_0 = const()[name = string("op_298_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_298_cast_fp16 = reduce_mean(axes = var_298_axes_0, keep_dims = var_298_keep_dims_0, x = var_296_cast_fp16)[name = string("op_298_cast_fp16")];
+            fp16 var_299_to_fp16 = const()[name = string("op_299_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_5_cast_fp16 = add(x = var_298_cast_fp16, y = var_299_to_fp16)[name = string("mean_squared_5_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_301_cast_fp16 = pow(x = mean_squared_5_cast_fp16, y = var_27_to_fp16)[name = string("op_301_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_9_cast_fp16 = mul(x = hidden_states_17_cast_fp16, y = var_301_cast_fp16)[name = string("normed_output_9_cast_fp16")];
+            tensor<fp16, [64]> const_18_to_fp16 = const()[name = string("const_18_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38539200)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_11_cast_fp16 = mul(x = normed_output_9_cast_fp16, y = const_18_to_fp16)[name = string("normed_output_11_cast_fp16")];
+            tensor<int32, [2]> var_321 = const()[name = string("op_321"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_322_axis_0 = const()[name = string("op_322_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_322_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_322_cast_fp16_1 = split(axis = var_322_axis_0, split_sizes = var_321, x = normed_output_11_cast_fp16)[name = string("op_322_cast_fp16")];
+            tensor<int32, [2]> var_325 = const()[name = string("op_325"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_326_axis_0 = const()[name = string("op_326_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_326_0, tensor<fp16, [1, 2304, 32]> var_326_1 = split(axis = var_326_axis_0, split_sizes = var_325, x = var_160_cast_fp16)[name = string("op_326")];
+            tensor<int32, [2]> var_329 = const()[name = string("op_329"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_330_axis_0 = const()[name = string("op_330_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_330_0, tensor<fp16, [1, 2304, 32]> var_330_1 = split(axis = var_330_axis_0, split_sizes = var_329, x = var_163_cast_fp16)[name = string("op_330")];
+            tensor<int32, [1]> cos_13_axes_0 = const()[name = string("cos_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_13 = expand_dims(axes = cos_13_axes_0, x = var_326_0)[name = string("cos_13")];
+            tensor<int32, [1]> sin_13_axes_0 = const()[name = string("sin_13_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_13 = expand_dims(axes = sin_13_axes_0, x = var_330_0)[name = string("sin_13")];
+            tensor<fp16, [1, 2304, 12, 32]> var_335_cast_fp16 = mul(x = var_322_cast_fp16_0, y = cos_13)[name = string("op_335_cast_fp16")];
+            tensor<int32, [4]> x1_5_begin_0 = const()[name = string("x1_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_5_end_0 = const()[name = string("x1_5_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_5_end_mask_0 = const()[name = string("x1_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_5_cast_fp16 = slice_by_index(begin = x1_5_begin_0, end = x1_5_end_0, end_mask = x1_5_end_mask_0, x = var_322_cast_fp16_0)[name = string("x1_5_cast_fp16")];
+            tensor<int32, [4]> x2_5_begin_0 = const()[name = string("x2_5_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_5_end_0 = const()[name = string("x2_5_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_5_end_mask_0 = const()[name = string("x2_5_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_5_cast_fp16 = slice_by_index(begin = x2_5_begin_0, end = x2_5_end_0, end_mask = x2_5_end_mask_0, x = var_322_cast_fp16_0)[name = string("x2_5_cast_fp16")];
+            fp16 const_23_promoted_to_fp16 = const()[name = string("const_23_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_346_cast_fp16 = mul(x = x2_5_cast_fp16, y = const_23_promoted_to_fp16)[name = string("op_346_cast_fp16")];
+            bool var_348_interleave_0 = const()[name = string("op_348_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_348_cast_fp16 = concat(axis = var_38, interleave = var_348_interleave_0, values = (var_346_cast_fp16, x1_5_cast_fp16))[name = string("op_348_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_349_cast_fp16 = mul(x = var_348_cast_fp16, y = sin_13)[name = string("op_349_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_350_cast_fp16 = add(x = var_335_cast_fp16, y = var_349_cast_fp16)[name = string("op_350_cast_fp16")];
+            tensor<int32, [1]> cos_17_axes_0 = const()[name = string("cos_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_17 = expand_dims(axes = cos_17_axes_0, x = var_326_1)[name = string("cos_17")];
+            tensor<int32, [1]> sin_17_axes_0 = const()[name = string("sin_17_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_17 = expand_dims(axes = sin_17_axes_0, x = var_330_1)[name = string("sin_17")];
+            tensor<fp16, [1, 2304, 12, 32]> var_353_cast_fp16 = mul(x = var_322_cast_fp16_1, y = cos_17)[name = string("op_353_cast_fp16")];
+            tensor<int32, [4]> x1_7_begin_0 = const()[name = string("x1_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_7_end_0 = const()[name = string("x1_7_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_7_end_mask_0 = const()[name = string("x1_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_7_cast_fp16 = slice_by_index(begin = x1_7_begin_0, end = x1_7_end_0, end_mask = x1_7_end_mask_0, x = var_322_cast_fp16_1)[name = string("x1_7_cast_fp16")];
+            tensor<int32, [4]> x2_7_begin_0 = const()[name = string("x2_7_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_7_end_0 = const()[name = string("x2_7_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_7_end_mask_0 = const()[name = string("x2_7_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_7_cast_fp16 = slice_by_index(begin = x2_7_begin_0, end = x2_7_end_0, end_mask = x2_7_end_mask_0, x = var_322_cast_fp16_1)[name = string("x2_7_cast_fp16")];
+            fp16 const_26_promoted_to_fp16 = const()[name = string("const_26_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_364_cast_fp16 = mul(x = x2_7_cast_fp16, y = const_26_promoted_to_fp16)[name = string("op_364_cast_fp16")];
+            bool var_366_interleave_0 = const()[name = string("op_366_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_366_cast_fp16 = concat(axis = var_38, interleave = var_366_interleave_0, values = (var_364_cast_fp16, x1_7_cast_fp16))[name = string("op_366_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_367_cast_fp16 = mul(x = var_366_cast_fp16, y = sin_17)[name = string("op_367_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_368_cast_fp16 = add(x = var_353_cast_fp16, y = var_367_cast_fp16)[name = string("op_368_cast_fp16")];
+            bool key_states_1_interleave_0 = const()[name = string("key_states_1_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_1_cast_fp16 = concat(axis = var_38, interleave = key_states_1_interleave_0, values = (var_350_cast_fp16, var_368_cast_fp16))[name = string("key_states_1_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_0_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(38539392)))];
+            tensor<fp16, [1, 2304, 768]> linear_3_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_0_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_0_cast_fp16)[name = string("linear_3_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.44p+3)];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.42p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_5_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_3_cast_fp16)[name = string("clip_5_cast_fp16")];
+            tensor<int32, [4]> var_381 = const()[name = string("op_381"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_23_cast_fp16 = reshape(shape = var_381, x = clip_5_cast_fp16)[name = string("hidden_states_23_cast_fp16")];
+            fp16 var_33_promoted_3_to_fp16 = const()[name = string("op_33_promoted_3_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_384_cast_fp16 = pow(x = hidden_states_23_cast_fp16, y = var_33_promoted_3_to_fp16)[name = string("op_384_cast_fp16")];
+            tensor<int32, [1]> var_386_axes_0 = const()[name = string("op_386_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_386_keep_dims_0 = const()[name = string("op_386_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_386_cast_fp16 = reduce_mean(axes = var_386_axes_0, keep_dims = var_386_keep_dims_0, x = var_384_cast_fp16)[name = string("op_386_cast_fp16")];
+            fp16 var_387_to_fp16 = const()[name = string("op_387_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_7_cast_fp16 = add(x = var_386_cast_fp16, y = var_387_to_fp16)[name = string("mean_squared_7_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_389_cast_fp16 = pow(x = mean_squared_7_cast_fp16, y = var_27_to_fp16)[name = string("op_389_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_13_cast_fp16 = mul(x = hidden_states_23_cast_fp16, y = var_389_cast_fp16)[name = string("normed_output_13_cast_fp16")];
+            tensor<int32, [4]> hidden_states_29_perm_0 = const()[name = string("hidden_states_29_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_0_transpose_y_0 = const()[name = string("matmul_0_transpose_y_0"), val = bool(true)];
+            bool matmul_0_transpose_x_0 = const()[name = string("matmul_0_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_64_perm_0 = const()[name = string("transpose_64_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_65_perm_0 = const()[name = string("transpose_65_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_65 = transpose(perm = transpose_65_perm_0, x = key_states_1_cast_fp16)[name = string("transpose_157")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_64 = transpose(perm = transpose_64_perm_0, x = query_states_1_cast_fp16)[name = string("transpose_158")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_0_cast_fp16 = matmul(transpose_x = matmul_0_transpose_x_0, transpose_y = matmul_0_transpose_y_0, x = transpose_64, y = transpose_65)[name = string("matmul_0_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_0_cast_fp16 = add(x = matmul_0_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_0_cast_fp16")];
+            int32 softmax_0_axis_0 = const()[name = string("softmax_0_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_0_cast_fp16 = softmax(axis = softmax_0_axis_0, x = add_0_cast_fp16)[name = string("softmax_0_cast_fp16")];
+            bool attn_output_1_transpose_x_0 = const()[name = string("attn_output_1_transpose_x_0"), val = bool(false)];
+            bool attn_output_1_transpose_y_0 = const()[name = string("attn_output_1_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_29_cast_fp16 = transpose(perm = hidden_states_29_perm_0, x = normed_output_13_cast_fp16)[name = string("transpose_159")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_1_cast_fp16 = matmul(transpose_x = attn_output_1_transpose_x_0, transpose_y = attn_output_1_transpose_y_0, x = softmax_0_cast_fp16, y = hidden_states_29_cast_fp16)[name = string("attn_output_1_cast_fp16")];
+            tensor<int32, [4]> var_394_perm_0 = const()[name = string("op_394_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_396 = const()[name = string("op_396"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_394_cast_fp16 = transpose(perm = var_394_perm_0, x = attn_output_1_cast_fp16)[name = string("transpose_156")];
+            tensor<fp16, [1, 2304, 768]> var_397_cast_fp16 = reshape(shape = var_396, x = var_394_cast_fp16)[name = string("op_397_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.cp+1)];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.bep+1)];
+            tensor<fp16, [1, 2304, 768]> clip_6_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_self_attn_o_proj_input_max_promoted_to_fp16, x = var_397_cast_fp16)[name = string("clip_6_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_0_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(39719104)))];
+            tensor<fp16, [1, 2304, 768]> linear_4_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_0_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_6_cast_fp16)[name = string("linear_4_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.56p+4)];
+            fp16 model_vision_tower_encoder_layers_0_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.54p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_7_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_4_cast_fp16)[name = string("clip_7_cast_fp16")];
+            fp16 var_33_promoted_4_to_fp16 = const()[name = string("op_33_promoted_4_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_410_cast_fp16 = pow(x = clip_7_cast_fp16, y = var_33_promoted_4_to_fp16)[name = string("op_410_cast_fp16")];
+            tensor<int32, [1]> var_412_axes_0 = const()[name = string("op_412_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_412_keep_dims_0 = const()[name = string("op_412_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_412_cast_fp16 = reduce_mean(axes = var_412_axes_0, keep_dims = var_412_keep_dims_0, x = var_410_cast_fp16)[name = string("op_412_cast_fp16")];
+            fp16 var_413_to_fp16 = const()[name = string("op_413_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_9_cast_fp16 = add(x = var_412_cast_fp16, y = var_413_to_fp16)[name = string("mean_squared_9_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_415_cast_fp16 = pow(x = mean_squared_9_cast_fp16, y = var_27_to_fp16)[name = string("op_415_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_15_cast_fp16 = mul(x = clip_7_cast_fp16, y = var_415_cast_fp16)[name = string("normed_output_15_cast_fp16")];
+            tensor<fp16, [768]> const_27_to_fp16 = const()[name = string("const_27_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40898816)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_17_cast_fp16 = mul(x = normed_output_15_cast_fp16, y = const_27_to_fp16)[name = string("normed_output_17_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_41_cast_fp16 = add(x = hidden_states_3_cast_fp16, y = normed_output_17_cast_fp16)[name = string("hidden_states_41_cast_fp16")];
+            fp16 var_33_promoted_5_to_fp16 = const()[name = string("op_33_promoted_5_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_423_cast_fp16 = pow(x = hidden_states_41_cast_fp16, y = var_33_promoted_5_to_fp16)[name = string("op_423_cast_fp16")];
+            tensor<int32, [1]> var_425_axes_0 = const()[name = string("op_425_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_425_keep_dims_0 = const()[name = string("op_425_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_425_cast_fp16 = reduce_mean(axes = var_425_axes_0, keep_dims = var_425_keep_dims_0, x = var_423_cast_fp16)[name = string("op_425_cast_fp16")];
+            fp16 var_426_to_fp16 = const()[name = string("op_426_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_11_cast_fp16 = add(x = var_425_cast_fp16, y = var_426_to_fp16)[name = string("mean_squared_11_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_428_cast_fp16 = pow(x = mean_squared_11_cast_fp16, y = var_27_to_fp16)[name = string("op_428_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_19_cast_fp16 = mul(x = hidden_states_41_cast_fp16, y = var_428_cast_fp16)[name = string("normed_output_19_cast_fp16")];
+            tensor<fp16, [768]> const_28_to_fp16 = const()[name = string("const_28_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40900416)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_21_cast_fp16 = mul(x = normed_output_19_cast_fp16, y = const_28_to_fp16)[name = string("normed_output_21_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.3ap+1)];
+            fp16 model_vision_tower_encoder_layers_0_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.38p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_8_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_21_cast_fp16)[name = string("clip_8_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_0_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(40902016)))];
+            tensor<fp16, [3072]> linear_5_bias_0_to_fp16 = const()[name = string("linear_5_bias_0_to_fp16"), val = tensor<fp16, [3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(45620672)))];
+            tensor<fp16, [1, 2304, 3072]> linear_5_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_0_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_8_cast_fp16)[name = string("linear_5_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.2p+2)];
+            fp16 model_vision_tower_encoder_layers_0_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.1ep+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_9_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_5_cast_fp16)[name = string("clip_9_cast_fp16")];
+            string var_445_mode_0 = const()[name = string("op_445_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_445_cast_fp16 = gelu(mode = var_445_mode_0, x = clip_9_cast_fp16)[name = string("op_445_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_0_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(45626880)))];
+            tensor<fp16, [1, 2304, 3072]> linear_6_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_0_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_8_cast_fp16)[name = string("linear_6_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.2p+2)];
+            fp16 model_vision_tower_encoder_layers_0_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.1ep+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_11_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_mlp_up_proj_output_max_promoted_to_fp16, x = linear_6_cast_fp16)[name = string("clip_11_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_51_cast_fp16 = mul(x = var_445_cast_fp16, y = clip_11_cast_fp16)[name = string("hidden_states_51_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.88p+3)];
+            fp16 model_vision_tower_encoder_layers_0_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.86p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_12_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_51_cast_fp16)[name = string("clip_12_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_0_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(50345536)))];
+            tensor<fp16, [1, 2304, 768]> linear_7_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_0_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_12_cast_fp16)[name = string("linear_7_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_0_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.4cp+3)];
+            fp16 model_vision_tower_encoder_layers_0_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_0_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.48p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_13_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_0_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_0_mlp_down_proj_output_max_promoted_to_fp16, x = linear_7_cast_fp16)[name = string("clip_13_cast_fp16")];
+            fp16 var_33_promoted_6_to_fp16 = const()[name = string("op_33_promoted_6_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_467_cast_fp16 = pow(x = clip_13_cast_fp16, y = var_33_promoted_6_to_fp16)[name = string("op_467_cast_fp16")];
+            tensor<int32, [1]> var_469_axes_0 = const()[name = string("op_469_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_469_keep_dims_0 = const()[name = string("op_469_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_469_cast_fp16 = reduce_mean(axes = var_469_axes_0, keep_dims = var_469_keep_dims_0, x = var_467_cast_fp16)[name = string("op_469_cast_fp16")];
+            fp16 var_470_to_fp16 = const()[name = string("op_470_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_13_cast_fp16 = add(x = var_469_cast_fp16, y = var_470_to_fp16)[name = string("mean_squared_13_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_472_cast_fp16 = pow(x = mean_squared_13_cast_fp16, y = var_27_to_fp16)[name = string("op_472_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_23_cast_fp16 = mul(x = clip_13_cast_fp16, y = var_472_cast_fp16)[name = string("normed_output_23_cast_fp16")];
+            tensor<fp16, [768]> const_29_to_fp16 = const()[name = string("const_29_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55064192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_25_cast_fp16 = mul(x = normed_output_23_cast_fp16, y = const_29_to_fp16)[name = string("normed_output_25_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_61_cast_fp16 = add(x = hidden_states_41_cast_fp16, y = normed_output_25_cast_fp16)[name = string("hidden_states_61_cast_fp16")];
+            fp16 var_33_promoted_7_to_fp16 = const()[name = string("op_33_promoted_7_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_486_cast_fp16 = pow(x = hidden_states_61_cast_fp16, y = var_33_promoted_7_to_fp16)[name = string("op_486_cast_fp16")];
+            tensor<int32, [1]> var_488_axes_0 = const()[name = string("op_488_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_488_keep_dims_0 = const()[name = string("op_488_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_488_cast_fp16 = reduce_mean(axes = var_488_axes_0, keep_dims = var_488_keep_dims_0, x = var_486_cast_fp16)[name = string("op_488_cast_fp16")];
+            fp16 var_489_to_fp16 = const()[name = string("op_489_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_15_cast_fp16 = add(x = var_488_cast_fp16, y = var_489_to_fp16)[name = string("mean_squared_15_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_491_cast_fp16 = pow(x = mean_squared_15_cast_fp16, y = var_27_to_fp16)[name = string("op_491_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_27_cast_fp16 = mul(x = hidden_states_61_cast_fp16, y = var_491_cast_fp16)[name = string("normed_output_27_cast_fp16")];
+            tensor<fp16, [768]> const_30_to_fp16 = const()[name = string("const_30_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55065792)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_29_cast_fp16 = mul(x = normed_output_27_cast_fp16, y = const_30_to_fp16)[name = string("normed_output_29_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.64p+2)];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.62p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_14_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_29_cast_fp16)[name = string("clip_14_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_1_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(55067392)))];
+            tensor<fp16, [1, 2304, 768]> linear_8_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_1_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_14_cast_fp16)[name = string("linear_8_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.8ep+3)];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.8cp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_15_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_8_cast_fp16)[name = string("clip_15_cast_fp16")];
+            tensor<int32, [4]> var_513 = const()[name = string("op_513"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_69_cast_fp16 = reshape(shape = var_513, x = clip_15_cast_fp16)[name = string("hidden_states_69_cast_fp16")];
+            fp16 var_33_promoted_8_to_fp16 = const()[name = string("op_33_promoted_8_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_517_cast_fp16 = pow(x = hidden_states_69_cast_fp16, y = var_33_promoted_8_to_fp16)[name = string("op_517_cast_fp16")];
+            tensor<int32, [1]> var_519_axes_0 = const()[name = string("op_519_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_519_keep_dims_0 = const()[name = string("op_519_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_519_cast_fp16 = reduce_mean(axes = var_519_axes_0, keep_dims = var_519_keep_dims_0, x = var_517_cast_fp16)[name = string("op_519_cast_fp16")];
+            fp16 var_520_to_fp16 = const()[name = string("op_520_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_17_cast_fp16 = add(x = var_519_cast_fp16, y = var_520_to_fp16)[name = string("mean_squared_17_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_522_cast_fp16 = pow(x = mean_squared_17_cast_fp16, y = var_27_to_fp16)[name = string("op_522_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_31_cast_fp16 = mul(x = hidden_states_69_cast_fp16, y = var_522_cast_fp16)[name = string("normed_output_31_cast_fp16")];
+            tensor<fp16, [64]> const_33_to_fp16 = const()[name = string("const_33_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56247104)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_33_cast_fp16 = mul(x = normed_output_31_cast_fp16, y = const_33_to_fp16)[name = string("normed_output_33_cast_fp16")];
+            tensor<int32, [2]> var_542 = const()[name = string("op_542"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_543_axis_0 = const()[name = string("op_543_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_543_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_543_cast_fp16_1 = split(axis = var_543_axis_0, split_sizes = var_542, x = normed_output_33_cast_fp16)[name = string("op_543_cast_fp16")];
+            tensor<int32, [2]> var_546 = const()[name = string("op_546"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_547_axis_0 = const()[name = string("op_547_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_547_0, tensor<fp16, [1, 2304, 32]> var_547_1 = split(axis = var_547_axis_0, split_sizes = var_546, x = var_160_cast_fp16)[name = string("op_547")];
+            tensor<int32, [2]> var_550 = const()[name = string("op_550"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_551_axis_0 = const()[name = string("op_551_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_551_0, tensor<fp16, [1, 2304, 32]> var_551_1 = split(axis = var_551_axis_0, split_sizes = var_550, x = var_163_cast_fp16)[name = string("op_551")];
+            tensor<int32, [1]> cos_21_axes_0 = const()[name = string("cos_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_21 = expand_dims(axes = cos_21_axes_0, x = var_547_0)[name = string("cos_21")];
+            tensor<int32, [1]> sin_21_axes_0 = const()[name = string("sin_21_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_21 = expand_dims(axes = sin_21_axes_0, x = var_551_0)[name = string("sin_21")];
+            tensor<fp16, [1, 2304, 12, 32]> var_556_cast_fp16 = mul(x = var_543_cast_fp16_0, y = cos_21)[name = string("op_556_cast_fp16")];
+            tensor<int32, [4]> x1_9_begin_0 = const()[name = string("x1_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_9_end_0 = const()[name = string("x1_9_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_9_end_mask_0 = const()[name = string("x1_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_9_cast_fp16 = slice_by_index(begin = x1_9_begin_0, end = x1_9_end_0, end_mask = x1_9_end_mask_0, x = var_543_cast_fp16_0)[name = string("x1_9_cast_fp16")];
+            tensor<int32, [4]> x2_9_begin_0 = const()[name = string("x2_9_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_9_end_0 = const()[name = string("x2_9_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_9_end_mask_0 = const()[name = string("x2_9_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_9_cast_fp16 = slice_by_index(begin = x2_9_begin_0, end = x2_9_end_0, end_mask = x2_9_end_mask_0, x = var_543_cast_fp16_0)[name = string("x2_9_cast_fp16")];
+            fp16 const_38_promoted_to_fp16 = const()[name = string("const_38_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_567_cast_fp16 = mul(x = x2_9_cast_fp16, y = const_38_promoted_to_fp16)[name = string("op_567_cast_fp16")];
+            bool var_569_interleave_0 = const()[name = string("op_569_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_569_cast_fp16 = concat(axis = var_38, interleave = var_569_interleave_0, values = (var_567_cast_fp16, x1_9_cast_fp16))[name = string("op_569_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_570_cast_fp16 = mul(x = var_569_cast_fp16, y = sin_21)[name = string("op_570_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_571_cast_fp16 = add(x = var_556_cast_fp16, y = var_570_cast_fp16)[name = string("op_571_cast_fp16")];
+            tensor<int32, [1]> cos_25_axes_0 = const()[name = string("cos_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_25 = expand_dims(axes = cos_25_axes_0, x = var_547_1)[name = string("cos_25")];
+            tensor<int32, [1]> sin_25_axes_0 = const()[name = string("sin_25_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_25 = expand_dims(axes = sin_25_axes_0, x = var_551_1)[name = string("sin_25")];
+            tensor<fp16, [1, 2304, 12, 32]> var_574_cast_fp16 = mul(x = var_543_cast_fp16_1, y = cos_25)[name = string("op_574_cast_fp16")];
+            tensor<int32, [4]> x1_11_begin_0 = const()[name = string("x1_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_11_end_0 = const()[name = string("x1_11_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_11_end_mask_0 = const()[name = string("x1_11_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_11_cast_fp16 = slice_by_index(begin = x1_11_begin_0, end = x1_11_end_0, end_mask = x1_11_end_mask_0, x = var_543_cast_fp16_1)[name = string("x1_11_cast_fp16")];
+            tensor<int32, [4]> x2_11_begin_0 = const()[name = string("x2_11_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_11_end_0 = const()[name = string("x2_11_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_11_end_mask_0 = const()[name = string("x2_11_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_11_cast_fp16 = slice_by_index(begin = x2_11_begin_0, end = x2_11_end_0, end_mask = x2_11_end_mask_0, x = var_543_cast_fp16_1)[name = string("x2_11_cast_fp16")];
+            fp16 const_41_promoted_to_fp16 = const()[name = string("const_41_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_585_cast_fp16 = mul(x = x2_11_cast_fp16, y = const_41_promoted_to_fp16)[name = string("op_585_cast_fp16")];
+            bool var_587_interleave_0 = const()[name = string("op_587_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_587_cast_fp16 = concat(axis = var_38, interleave = var_587_interleave_0, values = (var_585_cast_fp16, x1_11_cast_fp16))[name = string("op_587_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_588_cast_fp16 = mul(x = var_587_cast_fp16, y = sin_25)[name = string("op_588_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_589_cast_fp16 = add(x = var_574_cast_fp16, y = var_588_cast_fp16)[name = string("op_589_cast_fp16")];
+            bool query_states_3_interleave_0 = const()[name = string("query_states_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_3_cast_fp16 = concat(axis = var_38, interleave = query_states_3_interleave_0, values = (var_571_cast_fp16, var_589_cast_fp16))[name = string("query_states_3_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_1_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(56247296)))];
+            tensor<fp16, [1, 2304, 768]> linear_9_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_1_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_14_cast_fp16)[name = string("linear_9_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.66p+3)];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.64p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_17_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_9_cast_fp16)[name = string("clip_17_cast_fp16")];
+            tensor<int32, [4]> var_602 = const()[name = string("op_602"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_75_cast_fp16 = reshape(shape = var_602, x = clip_17_cast_fp16)[name = string("hidden_states_75_cast_fp16")];
+            fp16 var_33_promoted_9_to_fp16 = const()[name = string("op_33_promoted_9_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_606_cast_fp16 = pow(x = hidden_states_75_cast_fp16, y = var_33_promoted_9_to_fp16)[name = string("op_606_cast_fp16")];
+            tensor<int32, [1]> var_608_axes_0 = const()[name = string("op_608_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_608_keep_dims_0 = const()[name = string("op_608_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_608_cast_fp16 = reduce_mean(axes = var_608_axes_0, keep_dims = var_608_keep_dims_0, x = var_606_cast_fp16)[name = string("op_608_cast_fp16")];
+            fp16 var_609_to_fp16 = const()[name = string("op_609_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_19_cast_fp16 = add(x = var_608_cast_fp16, y = var_609_to_fp16)[name = string("mean_squared_19_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_611_cast_fp16 = pow(x = mean_squared_19_cast_fp16, y = var_27_to_fp16)[name = string("op_611_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_35_cast_fp16 = mul(x = hidden_states_75_cast_fp16, y = var_611_cast_fp16)[name = string("normed_output_35_cast_fp16")];
+            tensor<fp16, [64]> const_42_to_fp16 = const()[name = string("const_42_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57427008)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_37_cast_fp16 = mul(x = normed_output_35_cast_fp16, y = const_42_to_fp16)[name = string("normed_output_37_cast_fp16")];
+            tensor<int32, [2]> var_631 = const()[name = string("op_631"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_632_axis_0 = const()[name = string("op_632_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_632_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_632_cast_fp16_1 = split(axis = var_632_axis_0, split_sizes = var_631, x = normed_output_37_cast_fp16)[name = string("op_632_cast_fp16")];
+            tensor<int32, [2]> var_635 = const()[name = string("op_635"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_636_axis_0 = const()[name = string("op_636_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_636_0, tensor<fp16, [1, 2304, 32]> var_636_1 = split(axis = var_636_axis_0, split_sizes = var_635, x = var_160_cast_fp16)[name = string("op_636")];
+            tensor<int32, [2]> var_639 = const()[name = string("op_639"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_640_axis_0 = const()[name = string("op_640_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_640_0, tensor<fp16, [1, 2304, 32]> var_640_1 = split(axis = var_640_axis_0, split_sizes = var_639, x = var_163_cast_fp16)[name = string("op_640")];
+            tensor<int32, [1]> cos_29_axes_0 = const()[name = string("cos_29_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_29 = expand_dims(axes = cos_29_axes_0, x = var_636_0)[name = string("cos_29")];
+            tensor<int32, [1]> sin_29_axes_0 = const()[name = string("sin_29_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_29 = expand_dims(axes = sin_29_axes_0, x = var_640_0)[name = string("sin_29")];
+            tensor<fp16, [1, 2304, 12, 32]> var_645_cast_fp16 = mul(x = var_632_cast_fp16_0, y = cos_29)[name = string("op_645_cast_fp16")];
+            tensor<int32, [4]> x1_13_begin_0 = const()[name = string("x1_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_13_end_0 = const()[name = string("x1_13_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_13_end_mask_0 = const()[name = string("x1_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_13_cast_fp16 = slice_by_index(begin = x1_13_begin_0, end = x1_13_end_0, end_mask = x1_13_end_mask_0, x = var_632_cast_fp16_0)[name = string("x1_13_cast_fp16")];
+            tensor<int32, [4]> x2_13_begin_0 = const()[name = string("x2_13_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_13_end_0 = const()[name = string("x2_13_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_13_end_mask_0 = const()[name = string("x2_13_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_13_cast_fp16 = slice_by_index(begin = x2_13_begin_0, end = x2_13_end_0, end_mask = x2_13_end_mask_0, x = var_632_cast_fp16_0)[name = string("x2_13_cast_fp16")];
+            fp16 const_47_promoted_to_fp16 = const()[name = string("const_47_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_656_cast_fp16 = mul(x = x2_13_cast_fp16, y = const_47_promoted_to_fp16)[name = string("op_656_cast_fp16")];
+            bool var_658_interleave_0 = const()[name = string("op_658_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_658_cast_fp16 = concat(axis = var_38, interleave = var_658_interleave_0, values = (var_656_cast_fp16, x1_13_cast_fp16))[name = string("op_658_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_659_cast_fp16 = mul(x = var_658_cast_fp16, y = sin_29)[name = string("op_659_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_660_cast_fp16 = add(x = var_645_cast_fp16, y = var_659_cast_fp16)[name = string("op_660_cast_fp16")];
+            tensor<int32, [1]> cos_33_axes_0 = const()[name = string("cos_33_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_33 = expand_dims(axes = cos_33_axes_0, x = var_636_1)[name = string("cos_33")];
+            tensor<int32, [1]> sin_33_axes_0 = const()[name = string("sin_33_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_33 = expand_dims(axes = sin_33_axes_0, x = var_640_1)[name = string("sin_33")];
+            tensor<fp16, [1, 2304, 12, 32]> var_663_cast_fp16 = mul(x = var_632_cast_fp16_1, y = cos_33)[name = string("op_663_cast_fp16")];
+            tensor<int32, [4]> x1_15_begin_0 = const()[name = string("x1_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_15_end_0 = const()[name = string("x1_15_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_15_end_mask_0 = const()[name = string("x1_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_15_cast_fp16 = slice_by_index(begin = x1_15_begin_0, end = x1_15_end_0, end_mask = x1_15_end_mask_0, x = var_632_cast_fp16_1)[name = string("x1_15_cast_fp16")];
+            tensor<int32, [4]> x2_15_begin_0 = const()[name = string("x2_15_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_15_end_0 = const()[name = string("x2_15_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_15_end_mask_0 = const()[name = string("x2_15_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_15_cast_fp16 = slice_by_index(begin = x2_15_begin_0, end = x2_15_end_0, end_mask = x2_15_end_mask_0, x = var_632_cast_fp16_1)[name = string("x2_15_cast_fp16")];
+            fp16 const_50_promoted_to_fp16 = const()[name = string("const_50_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_674_cast_fp16 = mul(x = x2_15_cast_fp16, y = const_50_promoted_to_fp16)[name = string("op_674_cast_fp16")];
+            bool var_676_interleave_0 = const()[name = string("op_676_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_676_cast_fp16 = concat(axis = var_38, interleave = var_676_interleave_0, values = (var_674_cast_fp16, x1_15_cast_fp16))[name = string("op_676_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_677_cast_fp16 = mul(x = var_676_cast_fp16, y = sin_33)[name = string("op_677_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_678_cast_fp16 = add(x = var_663_cast_fp16, y = var_677_cast_fp16)[name = string("op_678_cast_fp16")];
+            bool key_states_3_interleave_0 = const()[name = string("key_states_3_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_3_cast_fp16 = concat(axis = var_38, interleave = key_states_3_interleave_0, values = (var_660_cast_fp16, var_678_cast_fp16))[name = string("key_states_3_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_1_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(57427200)))];
+            tensor<fp16, [1, 2304, 768]> linear_10_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_1_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_14_cast_fp16)[name = string("linear_10_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.66p+3)];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.64p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_19_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_10_cast_fp16)[name = string("clip_19_cast_fp16")];
+            tensor<int32, [4]> var_691 = const()[name = string("op_691"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_81_cast_fp16 = reshape(shape = var_691, x = clip_19_cast_fp16)[name = string("hidden_states_81_cast_fp16")];
+            fp16 var_33_promoted_10_to_fp16 = const()[name = string("op_33_promoted_10_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_694_cast_fp16 = pow(x = hidden_states_81_cast_fp16, y = var_33_promoted_10_to_fp16)[name = string("op_694_cast_fp16")];
+            tensor<int32, [1]> var_696_axes_0 = const()[name = string("op_696_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_696_keep_dims_0 = const()[name = string("op_696_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_696_cast_fp16 = reduce_mean(axes = var_696_axes_0, keep_dims = var_696_keep_dims_0, x = var_694_cast_fp16)[name = string("op_696_cast_fp16")];
+            fp16 var_697_to_fp16 = const()[name = string("op_697_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_21_cast_fp16 = add(x = var_696_cast_fp16, y = var_697_to_fp16)[name = string("mean_squared_21_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_699_cast_fp16 = pow(x = mean_squared_21_cast_fp16, y = var_27_to_fp16)[name = string("op_699_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_39_cast_fp16 = mul(x = hidden_states_81_cast_fp16, y = var_699_cast_fp16)[name = string("normed_output_39_cast_fp16")];
+            tensor<int32, [4]> hidden_states_87_perm_0 = const()[name = string("hidden_states_87_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_1_transpose_y_0 = const()[name = string("matmul_1_transpose_y_0"), val = bool(true)];
+            bool matmul_1_transpose_x_0 = const()[name = string("matmul_1_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_66_perm_0 = const()[name = string("transpose_66_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_67_perm_0 = const()[name = string("transpose_67_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_67 = transpose(perm = transpose_67_perm_0, x = key_states_3_cast_fp16)[name = string("transpose_153")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_66 = transpose(perm = transpose_66_perm_0, x = query_states_3_cast_fp16)[name = string("transpose_154")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_1_cast_fp16 = matmul(transpose_x = matmul_1_transpose_x_0, transpose_y = matmul_1_transpose_y_0, x = transpose_66, y = transpose_67)[name = string("matmul_1_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_1_cast_fp16 = add(x = matmul_1_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_1_cast_fp16")];
+            int32 softmax_1_axis_0 = const()[name = string("softmax_1_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_1_cast_fp16 = softmax(axis = softmax_1_axis_0, x = add_1_cast_fp16)[name = string("softmax_1_cast_fp16")];
+            bool attn_output_5_transpose_x_0 = const()[name = string("attn_output_5_transpose_x_0"), val = bool(false)];
+            bool attn_output_5_transpose_y_0 = const()[name = string("attn_output_5_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_87_cast_fp16 = transpose(perm = hidden_states_87_perm_0, x = normed_output_39_cast_fp16)[name = string("transpose_155")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_5_cast_fp16 = matmul(transpose_x = attn_output_5_transpose_x_0, transpose_y = attn_output_5_transpose_y_0, x = softmax_1_cast_fp16, y = hidden_states_87_cast_fp16)[name = string("attn_output_5_cast_fp16")];
+            tensor<int32, [4]> var_704_perm_0 = const()[name = string("op_704_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_706 = const()[name = string("op_706"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_704_cast_fp16 = transpose(perm = var_704_perm_0, x = attn_output_5_cast_fp16)[name = string("transpose_152")];
+            tensor<fp16, [1, 2304, 768]> var_707_cast_fp16 = reshape(shape = var_706, x = var_704_cast_fp16)[name = string("op_707_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.92p+1)];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.8ep+1)];
+            tensor<fp16, [1, 2304, 768]> clip_20_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_self_attn_o_proj_input_max_promoted_to_fp16, x = var_707_cast_fp16)[name = string("clip_20_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_1_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(58606912)))];
+            tensor<fp16, [1, 2304, 768]> linear_11_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_1_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_20_cast_fp16)[name = string("linear_11_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e8p+3)];
+            fp16 model_vision_tower_encoder_layers_1_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.e4p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_21_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_11_cast_fp16)[name = string("clip_21_cast_fp16")];
+            fp16 var_33_promoted_11_to_fp16 = const()[name = string("op_33_promoted_11_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_720_cast_fp16 = pow(x = clip_21_cast_fp16, y = var_33_promoted_11_to_fp16)[name = string("op_720_cast_fp16")];
+            tensor<int32, [1]> var_722_axes_0 = const()[name = string("op_722_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_722_keep_dims_0 = const()[name = string("op_722_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_722_cast_fp16 = reduce_mean(axes = var_722_axes_0, keep_dims = var_722_keep_dims_0, x = var_720_cast_fp16)[name = string("op_722_cast_fp16")];
+            fp16 var_723_to_fp16 = const()[name = string("op_723_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_23_cast_fp16 = add(x = var_722_cast_fp16, y = var_723_to_fp16)[name = string("mean_squared_23_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_725_cast_fp16 = pow(x = mean_squared_23_cast_fp16, y = var_27_to_fp16)[name = string("op_725_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_41_cast_fp16 = mul(x = clip_21_cast_fp16, y = var_725_cast_fp16)[name = string("normed_output_41_cast_fp16")];
+            tensor<fp16, [768]> const_51_to_fp16 = const()[name = string("const_51_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59786624)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_43_cast_fp16 = mul(x = normed_output_41_cast_fp16, y = const_51_to_fp16)[name = string("normed_output_43_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_99_cast_fp16 = add(x = hidden_states_61_cast_fp16, y = normed_output_43_cast_fp16)[name = string("hidden_states_99_cast_fp16")];
+            fp16 var_33_promoted_12_to_fp16 = const()[name = string("op_33_promoted_12_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_733_cast_fp16 = pow(x = hidden_states_99_cast_fp16, y = var_33_promoted_12_to_fp16)[name = string("op_733_cast_fp16")];
+            tensor<int32, [1]> var_735_axes_0 = const()[name = string("op_735_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_735_keep_dims_0 = const()[name = string("op_735_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_735_cast_fp16 = reduce_mean(axes = var_735_axes_0, keep_dims = var_735_keep_dims_0, x = var_733_cast_fp16)[name = string("op_735_cast_fp16")];
+            fp16 var_736_to_fp16 = const()[name = string("op_736_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_25_cast_fp16 = add(x = var_735_cast_fp16, y = var_736_to_fp16)[name = string("mean_squared_25_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_738_cast_fp16 = pow(x = mean_squared_25_cast_fp16, y = var_27_to_fp16)[name = string("op_738_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_45_cast_fp16 = mul(x = hidden_states_99_cast_fp16, y = var_738_cast_fp16)[name = string("normed_output_45_cast_fp16")];
+            tensor<fp16, [768]> const_52_to_fp16 = const()[name = string("const_52_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59788224)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_47_cast_fp16 = mul(x = normed_output_45_cast_fp16, y = const_52_to_fp16)[name = string("normed_output_47_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.16p+2)];
+            fp16 model_vision_tower_encoder_layers_1_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.14p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_22_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_47_cast_fp16)[name = string("clip_22_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_1_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(59789824)))];
+            tensor<fp16, [1, 2304, 3072]> linear_12_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_1_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_22_cast_fp16)[name = string("linear_12_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.82p+2)];
+            fp16 model_vision_tower_encoder_layers_1_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.7ep+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_23_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_12_cast_fp16)[name = string("clip_23_cast_fp16")];
+            string var_755_mode_0 = const()[name = string("op_755_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_755_cast_fp16 = gelu(mode = var_755_mode_0, x = clip_23_cast_fp16)[name = string("op_755_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_1_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(64508480)))];
+            tensor<fp16, [1, 2304, 3072]> linear_13_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_1_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_22_cast_fp16)[name = string("linear_13_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.82p+2)];
+            fp16 model_vision_tower_encoder_layers_1_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.7ep+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_25_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_mlp_up_proj_output_max_promoted_to_fp16, x = linear_13_cast_fp16)[name = string("clip_25_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_109_cast_fp16 = mul(x = var_755_cast_fp16, y = clip_25_cast_fp16)[name = string("hidden_states_109_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.e4p+4)];
+            fp16 model_vision_tower_encoder_layers_1_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.ep+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_26_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_109_cast_fp16)[name = string("clip_26_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_1_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(69227136)))];
+            tensor<fp16, [1, 2304, 768]> linear_14_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_1_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_26_cast_fp16)[name = string("linear_14_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_1_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.26p+4)];
+            fp16 model_vision_tower_encoder_layers_1_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_1_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.24p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_27_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_1_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_1_mlp_down_proj_output_max_promoted_to_fp16, x = linear_14_cast_fp16)[name = string("clip_27_cast_fp16")];
+            fp16 var_33_promoted_13_to_fp16 = const()[name = string("op_33_promoted_13_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_777_cast_fp16 = pow(x = clip_27_cast_fp16, y = var_33_promoted_13_to_fp16)[name = string("op_777_cast_fp16")];
+            tensor<int32, [1]> var_779_axes_0 = const()[name = string("op_779_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_779_keep_dims_0 = const()[name = string("op_779_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_779_cast_fp16 = reduce_mean(axes = var_779_axes_0, keep_dims = var_779_keep_dims_0, x = var_777_cast_fp16)[name = string("op_779_cast_fp16")];
+            fp16 var_780_to_fp16 = const()[name = string("op_780_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_27_cast_fp16 = add(x = var_779_cast_fp16, y = var_780_to_fp16)[name = string("mean_squared_27_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_782_cast_fp16 = pow(x = mean_squared_27_cast_fp16, y = var_27_to_fp16)[name = string("op_782_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_49_cast_fp16 = mul(x = clip_27_cast_fp16, y = var_782_cast_fp16)[name = string("normed_output_49_cast_fp16")];
+            tensor<fp16, [768]> const_53_to_fp16 = const()[name = string("const_53_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73945792)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_51_cast_fp16 = mul(x = normed_output_49_cast_fp16, y = const_53_to_fp16)[name = string("normed_output_51_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_119_cast_fp16 = add(x = hidden_states_99_cast_fp16, y = normed_output_51_cast_fp16)[name = string("hidden_states_119_cast_fp16")];
+            fp16 var_33_promoted_14_to_fp16 = const()[name = string("op_33_promoted_14_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_796_cast_fp16 = pow(x = hidden_states_119_cast_fp16, y = var_33_promoted_14_to_fp16)[name = string("op_796_cast_fp16")];
+            tensor<int32, [1]> var_798_axes_0 = const()[name = string("op_798_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_798_keep_dims_0 = const()[name = string("op_798_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_798_cast_fp16 = reduce_mean(axes = var_798_axes_0, keep_dims = var_798_keep_dims_0, x = var_796_cast_fp16)[name = string("op_798_cast_fp16")];
+            fp16 var_799_to_fp16 = const()[name = string("op_799_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_29_cast_fp16 = add(x = var_798_cast_fp16, y = var_799_to_fp16)[name = string("mean_squared_29_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_801_cast_fp16 = pow(x = mean_squared_29_cast_fp16, y = var_27_to_fp16)[name = string("op_801_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_53_cast_fp16 = mul(x = hidden_states_119_cast_fp16, y = var_801_cast_fp16)[name = string("normed_output_53_cast_fp16")];
+            tensor<fp16, [768]> const_54_to_fp16 = const()[name = string("const_54_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73947392)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_55_cast_fp16 = mul(x = normed_output_53_cast_fp16, y = const_54_to_fp16)[name = string("normed_output_55_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.84p+3)];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.82p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_28_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_55_cast_fp16)[name = string("clip_28_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_2_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(73948992)))];
+            tensor<fp16, [1, 2304, 768]> linear_15_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_2_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_28_cast_fp16)[name = string("linear_15_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.72p+4)];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.6ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_29_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_15_cast_fp16)[name = string("clip_29_cast_fp16")];
+            tensor<int32, [4]> var_823 = const()[name = string("op_823"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_127_cast_fp16 = reshape(shape = var_823, x = clip_29_cast_fp16)[name = string("hidden_states_127_cast_fp16")];
+            fp16 var_33_promoted_15_to_fp16 = const()[name = string("op_33_promoted_15_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_827_cast_fp16 = pow(x = hidden_states_127_cast_fp16, y = var_33_promoted_15_to_fp16)[name = string("op_827_cast_fp16")];
+            tensor<int32, [1]> var_829_axes_0 = const()[name = string("op_829_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_829_keep_dims_0 = const()[name = string("op_829_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_829_cast_fp16 = reduce_mean(axes = var_829_axes_0, keep_dims = var_829_keep_dims_0, x = var_827_cast_fp16)[name = string("op_829_cast_fp16")];
+            fp16 var_830_to_fp16 = const()[name = string("op_830_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_31_cast_fp16 = add(x = var_829_cast_fp16, y = var_830_to_fp16)[name = string("mean_squared_31_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_832_cast_fp16 = pow(x = mean_squared_31_cast_fp16, y = var_27_to_fp16)[name = string("op_832_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_57_cast_fp16 = mul(x = hidden_states_127_cast_fp16, y = var_832_cast_fp16)[name = string("normed_output_57_cast_fp16")];
+            tensor<fp16, [64]> const_57_to_fp16 = const()[name = string("const_57_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75128704)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_59_cast_fp16 = mul(x = normed_output_57_cast_fp16, y = const_57_to_fp16)[name = string("normed_output_59_cast_fp16")];
+            tensor<int32, [2]> var_852 = const()[name = string("op_852"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_853_axis_0 = const()[name = string("op_853_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_853_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_853_cast_fp16_1 = split(axis = var_853_axis_0, split_sizes = var_852, x = normed_output_59_cast_fp16)[name = string("op_853_cast_fp16")];
+            tensor<int32, [2]> var_856 = const()[name = string("op_856"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_857_axis_0 = const()[name = string("op_857_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_857_0, tensor<fp16, [1, 2304, 32]> var_857_1 = split(axis = var_857_axis_0, split_sizes = var_856, x = var_160_cast_fp16)[name = string("op_857")];
+            tensor<int32, [2]> var_860 = const()[name = string("op_860"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_861_axis_0 = const()[name = string("op_861_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_861_0, tensor<fp16, [1, 2304, 32]> var_861_1 = split(axis = var_861_axis_0, split_sizes = var_860, x = var_163_cast_fp16)[name = string("op_861")];
+            tensor<int32, [1]> cos_37_axes_0 = const()[name = string("cos_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_37 = expand_dims(axes = cos_37_axes_0, x = var_857_0)[name = string("cos_37")];
+            tensor<int32, [1]> sin_37_axes_0 = const()[name = string("sin_37_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_37 = expand_dims(axes = sin_37_axes_0, x = var_861_0)[name = string("sin_37")];
+            tensor<fp16, [1, 2304, 12, 32]> var_866_cast_fp16 = mul(x = var_853_cast_fp16_0, y = cos_37)[name = string("op_866_cast_fp16")];
+            tensor<int32, [4]> x1_17_begin_0 = const()[name = string("x1_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_17_end_0 = const()[name = string("x1_17_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_17_end_mask_0 = const()[name = string("x1_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_17_cast_fp16 = slice_by_index(begin = x1_17_begin_0, end = x1_17_end_0, end_mask = x1_17_end_mask_0, x = var_853_cast_fp16_0)[name = string("x1_17_cast_fp16")];
+            tensor<int32, [4]> x2_17_begin_0 = const()[name = string("x2_17_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_17_end_0 = const()[name = string("x2_17_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_17_end_mask_0 = const()[name = string("x2_17_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_17_cast_fp16 = slice_by_index(begin = x2_17_begin_0, end = x2_17_end_0, end_mask = x2_17_end_mask_0, x = var_853_cast_fp16_0)[name = string("x2_17_cast_fp16")];
+            fp16 const_62_promoted_to_fp16 = const()[name = string("const_62_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_877_cast_fp16 = mul(x = x2_17_cast_fp16, y = const_62_promoted_to_fp16)[name = string("op_877_cast_fp16")];
+            bool var_879_interleave_0 = const()[name = string("op_879_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_879_cast_fp16 = concat(axis = var_38, interleave = var_879_interleave_0, values = (var_877_cast_fp16, x1_17_cast_fp16))[name = string("op_879_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_880_cast_fp16 = mul(x = var_879_cast_fp16, y = sin_37)[name = string("op_880_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_881_cast_fp16 = add(x = var_866_cast_fp16, y = var_880_cast_fp16)[name = string("op_881_cast_fp16")];
+            tensor<int32, [1]> cos_41_axes_0 = const()[name = string("cos_41_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_41 = expand_dims(axes = cos_41_axes_0, x = var_857_1)[name = string("cos_41")];
+            tensor<int32, [1]> sin_41_axes_0 = const()[name = string("sin_41_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_41 = expand_dims(axes = sin_41_axes_0, x = var_861_1)[name = string("sin_41")];
+            tensor<fp16, [1, 2304, 12, 32]> var_884_cast_fp16 = mul(x = var_853_cast_fp16_1, y = cos_41)[name = string("op_884_cast_fp16")];
+            tensor<int32, [4]> x1_19_begin_0 = const()[name = string("x1_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_19_end_0 = const()[name = string("x1_19_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_19_end_mask_0 = const()[name = string("x1_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_19_cast_fp16 = slice_by_index(begin = x1_19_begin_0, end = x1_19_end_0, end_mask = x1_19_end_mask_0, x = var_853_cast_fp16_1)[name = string("x1_19_cast_fp16")];
+            tensor<int32, [4]> x2_19_begin_0 = const()[name = string("x2_19_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_19_end_0 = const()[name = string("x2_19_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_19_end_mask_0 = const()[name = string("x2_19_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_19_cast_fp16 = slice_by_index(begin = x2_19_begin_0, end = x2_19_end_0, end_mask = x2_19_end_mask_0, x = var_853_cast_fp16_1)[name = string("x2_19_cast_fp16")];
+            fp16 const_65_promoted_to_fp16 = const()[name = string("const_65_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_895_cast_fp16 = mul(x = x2_19_cast_fp16, y = const_65_promoted_to_fp16)[name = string("op_895_cast_fp16")];
+            bool var_897_interleave_0 = const()[name = string("op_897_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_897_cast_fp16 = concat(axis = var_38, interleave = var_897_interleave_0, values = (var_895_cast_fp16, x1_19_cast_fp16))[name = string("op_897_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_898_cast_fp16 = mul(x = var_897_cast_fp16, y = sin_41)[name = string("op_898_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_899_cast_fp16 = add(x = var_884_cast_fp16, y = var_898_cast_fp16)[name = string("op_899_cast_fp16")];
+            bool query_states_5_interleave_0 = const()[name = string("query_states_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_5_cast_fp16 = concat(axis = var_38, interleave = query_states_5_interleave_0, values = (var_881_cast_fp16, var_899_cast_fp16))[name = string("query_states_5_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_2_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(75128896)))];
+            tensor<fp16, [1, 2304, 768]> linear_16_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_2_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_28_cast_fp16)[name = string("linear_16_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.42p+4)];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_31_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_16_cast_fp16)[name = string("clip_31_cast_fp16")];
+            tensor<int32, [4]> var_912 = const()[name = string("op_912"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_133_cast_fp16 = reshape(shape = var_912, x = clip_31_cast_fp16)[name = string("hidden_states_133_cast_fp16")];
+            fp16 var_33_promoted_16_to_fp16 = const()[name = string("op_33_promoted_16_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_916_cast_fp16 = pow(x = hidden_states_133_cast_fp16, y = var_33_promoted_16_to_fp16)[name = string("op_916_cast_fp16")];
+            tensor<int32, [1]> var_918_axes_0 = const()[name = string("op_918_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_918_keep_dims_0 = const()[name = string("op_918_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_918_cast_fp16 = reduce_mean(axes = var_918_axes_0, keep_dims = var_918_keep_dims_0, x = var_916_cast_fp16)[name = string("op_918_cast_fp16")];
+            fp16 var_919_to_fp16 = const()[name = string("op_919_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_33_cast_fp16 = add(x = var_918_cast_fp16, y = var_919_to_fp16)[name = string("mean_squared_33_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_921_cast_fp16 = pow(x = mean_squared_33_cast_fp16, y = var_27_to_fp16)[name = string("op_921_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_61_cast_fp16 = mul(x = hidden_states_133_cast_fp16, y = var_921_cast_fp16)[name = string("normed_output_61_cast_fp16")];
+            tensor<fp16, [64]> const_66_to_fp16 = const()[name = string("const_66_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(76308608)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_63_cast_fp16 = mul(x = normed_output_61_cast_fp16, y = const_66_to_fp16)[name = string("normed_output_63_cast_fp16")];
+            tensor<int32, [2]> var_941 = const()[name = string("op_941"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_942_axis_0 = const()[name = string("op_942_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_942_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_942_cast_fp16_1 = split(axis = var_942_axis_0, split_sizes = var_941, x = normed_output_63_cast_fp16)[name = string("op_942_cast_fp16")];
+            tensor<int32, [2]> var_945 = const()[name = string("op_945"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_946_axis_0 = const()[name = string("op_946_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_946_0, tensor<fp16, [1, 2304, 32]> var_946_1 = split(axis = var_946_axis_0, split_sizes = var_945, x = var_160_cast_fp16)[name = string("op_946")];
+            tensor<int32, [2]> var_949 = const()[name = string("op_949"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_950_axis_0 = const()[name = string("op_950_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_950_0, tensor<fp16, [1, 2304, 32]> var_950_1 = split(axis = var_950_axis_0, split_sizes = var_949, x = var_163_cast_fp16)[name = string("op_950")];
+            tensor<int32, [1]> cos_45_axes_0 = const()[name = string("cos_45_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_45 = expand_dims(axes = cos_45_axes_0, x = var_946_0)[name = string("cos_45")];
+            tensor<int32, [1]> sin_45_axes_0 = const()[name = string("sin_45_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_45 = expand_dims(axes = sin_45_axes_0, x = var_950_0)[name = string("sin_45")];
+            tensor<fp16, [1, 2304, 12, 32]> var_955_cast_fp16 = mul(x = var_942_cast_fp16_0, y = cos_45)[name = string("op_955_cast_fp16")];
+            tensor<int32, [4]> x1_21_begin_0 = const()[name = string("x1_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_21_end_0 = const()[name = string("x1_21_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_21_end_mask_0 = const()[name = string("x1_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_21_cast_fp16 = slice_by_index(begin = x1_21_begin_0, end = x1_21_end_0, end_mask = x1_21_end_mask_0, x = var_942_cast_fp16_0)[name = string("x1_21_cast_fp16")];
+            tensor<int32, [4]> x2_21_begin_0 = const()[name = string("x2_21_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_21_end_0 = const()[name = string("x2_21_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_21_end_mask_0 = const()[name = string("x2_21_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_21_cast_fp16 = slice_by_index(begin = x2_21_begin_0, end = x2_21_end_0, end_mask = x2_21_end_mask_0, x = var_942_cast_fp16_0)[name = string("x2_21_cast_fp16")];
+            fp16 const_71_promoted_to_fp16 = const()[name = string("const_71_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_966_cast_fp16 = mul(x = x2_21_cast_fp16, y = const_71_promoted_to_fp16)[name = string("op_966_cast_fp16")];
+            bool var_968_interleave_0 = const()[name = string("op_968_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_968_cast_fp16 = concat(axis = var_38, interleave = var_968_interleave_0, values = (var_966_cast_fp16, x1_21_cast_fp16))[name = string("op_968_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_969_cast_fp16 = mul(x = var_968_cast_fp16, y = sin_45)[name = string("op_969_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_970_cast_fp16 = add(x = var_955_cast_fp16, y = var_969_cast_fp16)[name = string("op_970_cast_fp16")];
+            tensor<int32, [1]> cos_49_axes_0 = const()[name = string("cos_49_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_49 = expand_dims(axes = cos_49_axes_0, x = var_946_1)[name = string("cos_49")];
+            tensor<int32, [1]> sin_49_axes_0 = const()[name = string("sin_49_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_49 = expand_dims(axes = sin_49_axes_0, x = var_950_1)[name = string("sin_49")];
+            tensor<fp16, [1, 2304, 12, 32]> var_973_cast_fp16 = mul(x = var_942_cast_fp16_1, y = cos_49)[name = string("op_973_cast_fp16")];
+            tensor<int32, [4]> x1_23_begin_0 = const()[name = string("x1_23_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_23_end_0 = const()[name = string("x1_23_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_23_end_mask_0 = const()[name = string("x1_23_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_23_cast_fp16 = slice_by_index(begin = x1_23_begin_0, end = x1_23_end_0, end_mask = x1_23_end_mask_0, x = var_942_cast_fp16_1)[name = string("x1_23_cast_fp16")];
+            tensor<int32, [4]> x2_23_begin_0 = const()[name = string("x2_23_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_23_end_0 = const()[name = string("x2_23_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_23_end_mask_0 = const()[name = string("x2_23_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_23_cast_fp16 = slice_by_index(begin = x2_23_begin_0, end = x2_23_end_0, end_mask = x2_23_end_mask_0, x = var_942_cast_fp16_1)[name = string("x2_23_cast_fp16")];
+            fp16 const_74_promoted_to_fp16 = const()[name = string("const_74_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_984_cast_fp16 = mul(x = x2_23_cast_fp16, y = const_74_promoted_to_fp16)[name = string("op_984_cast_fp16")];
+            bool var_986_interleave_0 = const()[name = string("op_986_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_986_cast_fp16 = concat(axis = var_38, interleave = var_986_interleave_0, values = (var_984_cast_fp16, x1_23_cast_fp16))[name = string("op_986_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_987_cast_fp16 = mul(x = var_986_cast_fp16, y = sin_49)[name = string("op_987_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_988_cast_fp16 = add(x = var_973_cast_fp16, y = var_987_cast_fp16)[name = string("op_988_cast_fp16")];
+            bool key_states_5_interleave_0 = const()[name = string("key_states_5_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_5_cast_fp16 = concat(axis = var_38, interleave = key_states_5_interleave_0, values = (var_970_cast_fp16, var_988_cast_fp16))[name = string("key_states_5_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_2_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(76308800)))];
+            tensor<fp16, [1, 2304, 768]> linear_17_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_2_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_28_cast_fp16)[name = string("linear_17_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.42p+4)];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_33_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_17_cast_fp16)[name = string("clip_33_cast_fp16")];
+            tensor<int32, [4]> var_1001 = const()[name = string("op_1001"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_139_cast_fp16 = reshape(shape = var_1001, x = clip_33_cast_fp16)[name = string("hidden_states_139_cast_fp16")];
+            fp16 var_33_promoted_17_to_fp16 = const()[name = string("op_33_promoted_17_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1004_cast_fp16 = pow(x = hidden_states_139_cast_fp16, y = var_33_promoted_17_to_fp16)[name = string("op_1004_cast_fp16")];
+            tensor<int32, [1]> var_1006_axes_0 = const()[name = string("op_1006_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1006_keep_dims_0 = const()[name = string("op_1006_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1006_cast_fp16 = reduce_mean(axes = var_1006_axes_0, keep_dims = var_1006_keep_dims_0, x = var_1004_cast_fp16)[name = string("op_1006_cast_fp16")];
+            fp16 var_1007_to_fp16 = const()[name = string("op_1007_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_35_cast_fp16 = add(x = var_1006_cast_fp16, y = var_1007_to_fp16)[name = string("mean_squared_35_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1009_cast_fp16 = pow(x = mean_squared_35_cast_fp16, y = var_27_to_fp16)[name = string("op_1009_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_65_cast_fp16 = mul(x = hidden_states_139_cast_fp16, y = var_1009_cast_fp16)[name = string("normed_output_65_cast_fp16")];
+            tensor<int32, [4]> hidden_states_145_perm_0 = const()[name = string("hidden_states_145_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_2_transpose_y_0 = const()[name = string("matmul_2_transpose_y_0"), val = bool(true)];
+            bool matmul_2_transpose_x_0 = const()[name = string("matmul_2_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_68_perm_0 = const()[name = string("transpose_68_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_69_perm_0 = const()[name = string("transpose_69_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_69 = transpose(perm = transpose_69_perm_0, x = key_states_5_cast_fp16)[name = string("transpose_149")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_68 = transpose(perm = transpose_68_perm_0, x = query_states_5_cast_fp16)[name = string("transpose_150")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_2_cast_fp16 = matmul(transpose_x = matmul_2_transpose_x_0, transpose_y = matmul_2_transpose_y_0, x = transpose_68, y = transpose_69)[name = string("matmul_2_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_2_cast_fp16 = add(x = matmul_2_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_2_cast_fp16")];
+            int32 softmax_2_axis_0 = const()[name = string("softmax_2_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_2_cast_fp16 = softmax(axis = softmax_2_axis_0, x = add_2_cast_fp16)[name = string("softmax_2_cast_fp16")];
+            bool attn_output_9_transpose_x_0 = const()[name = string("attn_output_9_transpose_x_0"), val = bool(false)];
+            bool attn_output_9_transpose_y_0 = const()[name = string("attn_output_9_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_145_cast_fp16 = transpose(perm = hidden_states_145_perm_0, x = normed_output_65_cast_fp16)[name = string("transpose_151")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_9_cast_fp16 = matmul(transpose_x = attn_output_9_transpose_x_0, transpose_y = attn_output_9_transpose_y_0, x = softmax_2_cast_fp16, y = hidden_states_145_cast_fp16)[name = string("attn_output_9_cast_fp16")];
+            tensor<int32, [4]> var_1014_perm_0 = const()[name = string("op_1014_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1016 = const()[name = string("op_1016"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_1014_cast_fp16 = transpose(perm = var_1014_perm_0, x = attn_output_9_cast_fp16)[name = string("transpose_148")];
+            tensor<fp16, [1, 2304, 768]> var_1017_cast_fp16 = reshape(shape = var_1016, x = var_1014_cast_fp16)[name = string("op_1017_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.76p+1)];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.72p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_34_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_self_attn_o_proj_input_max_promoted_to_fp16, x = var_1017_cast_fp16)[name = string("clip_34_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_2_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(77488512)))];
+            tensor<fp16, [1, 2304, 768]> linear_18_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_2_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_34_cast_fp16)[name = string("linear_18_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.aep+3)];
+            fp16 model_vision_tower_encoder_layers_2_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.acp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_35_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_18_cast_fp16)[name = string("clip_35_cast_fp16")];
+            fp16 var_33_promoted_18_to_fp16 = const()[name = string("op_33_promoted_18_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1030_cast_fp16 = pow(x = clip_35_cast_fp16, y = var_33_promoted_18_to_fp16)[name = string("op_1030_cast_fp16")];
+            tensor<int32, [1]> var_1032_axes_0 = const()[name = string("op_1032_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1032_keep_dims_0 = const()[name = string("op_1032_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1032_cast_fp16 = reduce_mean(axes = var_1032_axes_0, keep_dims = var_1032_keep_dims_0, x = var_1030_cast_fp16)[name = string("op_1032_cast_fp16")];
+            fp16 var_1033_to_fp16 = const()[name = string("op_1033_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_37_cast_fp16 = add(x = var_1032_cast_fp16, y = var_1033_to_fp16)[name = string("mean_squared_37_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1035_cast_fp16 = pow(x = mean_squared_37_cast_fp16, y = var_27_to_fp16)[name = string("op_1035_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_67_cast_fp16 = mul(x = clip_35_cast_fp16, y = var_1035_cast_fp16)[name = string("normed_output_67_cast_fp16")];
+            tensor<fp16, [768]> const_75_to_fp16 = const()[name = string("const_75_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78668224)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_69_cast_fp16 = mul(x = normed_output_67_cast_fp16, y = const_75_to_fp16)[name = string("normed_output_69_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_157_cast_fp16 = add(x = hidden_states_119_cast_fp16, y = normed_output_69_cast_fp16)[name = string("hidden_states_157_cast_fp16")];
+            fp16 var_33_promoted_19_to_fp16 = const()[name = string("op_33_promoted_19_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1043_cast_fp16 = pow(x = hidden_states_157_cast_fp16, y = var_33_promoted_19_to_fp16)[name = string("op_1043_cast_fp16")];
+            tensor<int32, [1]> var_1045_axes_0 = const()[name = string("op_1045_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1045_keep_dims_0 = const()[name = string("op_1045_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1045_cast_fp16 = reduce_mean(axes = var_1045_axes_0, keep_dims = var_1045_keep_dims_0, x = var_1043_cast_fp16)[name = string("op_1045_cast_fp16")];
+            fp16 var_1046_to_fp16 = const()[name = string("op_1046_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_39_cast_fp16 = add(x = var_1045_cast_fp16, y = var_1046_to_fp16)[name = string("mean_squared_39_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1048_cast_fp16 = pow(x = mean_squared_39_cast_fp16, y = var_27_to_fp16)[name = string("op_1048_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_71_cast_fp16 = mul(x = hidden_states_157_cast_fp16, y = var_1048_cast_fp16)[name = string("normed_output_71_cast_fp16")];
+            tensor<fp16, [768]> const_76_to_fp16 = const()[name = string("const_76_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78669824)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_73_cast_fp16 = mul(x = normed_output_71_cast_fp16, y = const_76_to_fp16)[name = string("normed_output_73_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.38p+3)];
+            fp16 model_vision_tower_encoder_layers_2_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.34p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_36_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_73_cast_fp16)[name = string("clip_36_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_2_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(78671424)))];
+            tensor<fp16, [1, 2304, 3072]> linear_19_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_2_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_36_cast_fp16)[name = string("linear_19_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.9cp+3)];
+            fp16 model_vision_tower_encoder_layers_2_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.98p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_37_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_19_cast_fp16)[name = string("clip_37_cast_fp16")];
+            string var_1065_mode_0 = const()[name = string("op_1065_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_1065_cast_fp16 = gelu(mode = var_1065_mode_0, x = clip_37_cast_fp16)[name = string("op_1065_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_2_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(83390080)))];
+            tensor<fp16, [1, 2304, 3072]> linear_20_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_2_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_36_cast_fp16)[name = string("linear_20_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.9cp+3)];
+            fp16 model_vision_tower_encoder_layers_2_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.98p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_39_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_mlp_up_proj_output_max_promoted_to_fp16, x = linear_20_cast_fp16)[name = string("clip_39_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_167_cast_fp16 = mul(x = var_1065_cast_fp16, y = clip_39_cast_fp16)[name = string("hidden_states_167_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.46p+6)];
+            fp16 model_vision_tower_encoder_layers_2_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.44p+6)];
+            tensor<fp16, [1, 2304, 3072]> clip_40_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_167_cast_fp16)[name = string("clip_40_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_2_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(88108736)))];
+            tensor<fp16, [1, 2304, 768]> linear_21_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_2_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_40_cast_fp16)[name = string("linear_21_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_2_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6p+6)];
+            fp16 model_vision_tower_encoder_layers_2_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_2_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5ep+6)];
+            tensor<fp16, [1, 2304, 768]> clip_41_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_2_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_2_mlp_down_proj_output_max_promoted_to_fp16, x = linear_21_cast_fp16)[name = string("clip_41_cast_fp16")];
+            fp16 var_33_promoted_20_to_fp16 = const()[name = string("op_33_promoted_20_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1087_cast_fp16 = pow(x = clip_41_cast_fp16, y = var_33_promoted_20_to_fp16)[name = string("op_1087_cast_fp16")];
+            tensor<int32, [1]> var_1089_axes_0 = const()[name = string("op_1089_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1089_keep_dims_0 = const()[name = string("op_1089_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1089_cast_fp16 = reduce_mean(axes = var_1089_axes_0, keep_dims = var_1089_keep_dims_0, x = var_1087_cast_fp16)[name = string("op_1089_cast_fp16")];
+            fp16 var_1090_to_fp16 = const()[name = string("op_1090_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_41_cast_fp16 = add(x = var_1089_cast_fp16, y = var_1090_to_fp16)[name = string("mean_squared_41_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1092_cast_fp16 = pow(x = mean_squared_41_cast_fp16, y = var_27_to_fp16)[name = string("op_1092_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_75_cast_fp16 = mul(x = clip_41_cast_fp16, y = var_1092_cast_fp16)[name = string("normed_output_75_cast_fp16")];
+            tensor<fp16, [768]> const_77_to_fp16 = const()[name = string("const_77_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92827392)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_77_cast_fp16 = mul(x = normed_output_75_cast_fp16, y = const_77_to_fp16)[name = string("normed_output_77_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_177_cast_fp16 = add(x = hidden_states_157_cast_fp16, y = normed_output_77_cast_fp16)[name = string("hidden_states_177_cast_fp16")];
+            fp16 var_33_promoted_21_to_fp16 = const()[name = string("op_33_promoted_21_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1106_cast_fp16 = pow(x = hidden_states_177_cast_fp16, y = var_33_promoted_21_to_fp16)[name = string("op_1106_cast_fp16")];
+            tensor<int32, [1]> var_1108_axes_0 = const()[name = string("op_1108_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1108_keep_dims_0 = const()[name = string("op_1108_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1108_cast_fp16 = reduce_mean(axes = var_1108_axes_0, keep_dims = var_1108_keep_dims_0, x = var_1106_cast_fp16)[name = string("op_1108_cast_fp16")];
+            fp16 var_1109_to_fp16 = const()[name = string("op_1109_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_43_cast_fp16 = add(x = var_1108_cast_fp16, y = var_1109_to_fp16)[name = string("mean_squared_43_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1111_cast_fp16 = pow(x = mean_squared_43_cast_fp16, y = var_27_to_fp16)[name = string("op_1111_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_79_cast_fp16 = mul(x = hidden_states_177_cast_fp16, y = var_1111_cast_fp16)[name = string("normed_output_79_cast_fp16")];
+            tensor<fp16, [768]> const_78_to_fp16 = const()[name = string("const_78_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92828992)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_81_cast_fp16 = mul(x = normed_output_79_cast_fp16, y = const_78_to_fp16)[name = string("normed_output_81_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.a4p+3)];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.a2p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_42_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_81_cast_fp16)[name = string("clip_42_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_3_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(92830592)))];
+            tensor<fp16, [1, 2304, 768]> linear_22_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_3_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_42_cast_fp16)[name = string("linear_22_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_43_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_22_cast_fp16)[name = string("clip_43_cast_fp16")];
+            tensor<int32, [4]> var_1133 = const()[name = string("op_1133"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_185_cast_fp16 = reshape(shape = var_1133, x = clip_43_cast_fp16)[name = string("hidden_states_185_cast_fp16")];
+            fp16 var_33_promoted_22_to_fp16 = const()[name = string("op_33_promoted_22_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1137_cast_fp16 = pow(x = hidden_states_185_cast_fp16, y = var_33_promoted_22_to_fp16)[name = string("op_1137_cast_fp16")];
+            tensor<int32, [1]> var_1139_axes_0 = const()[name = string("op_1139_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1139_keep_dims_0 = const()[name = string("op_1139_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1139_cast_fp16 = reduce_mean(axes = var_1139_axes_0, keep_dims = var_1139_keep_dims_0, x = var_1137_cast_fp16)[name = string("op_1139_cast_fp16")];
+            fp16 var_1140_to_fp16 = const()[name = string("op_1140_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_45_cast_fp16 = add(x = var_1139_cast_fp16, y = var_1140_to_fp16)[name = string("mean_squared_45_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1142_cast_fp16 = pow(x = mean_squared_45_cast_fp16, y = var_27_to_fp16)[name = string("op_1142_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_83_cast_fp16 = mul(x = hidden_states_185_cast_fp16, y = var_1142_cast_fp16)[name = string("normed_output_83_cast_fp16")];
+            tensor<fp16, [64]> const_81_to_fp16 = const()[name = string("const_81_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(94010304)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_85_cast_fp16 = mul(x = normed_output_83_cast_fp16, y = const_81_to_fp16)[name = string("normed_output_85_cast_fp16")];
+            tensor<int32, [2]> var_1162 = const()[name = string("op_1162"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1163_axis_0 = const()[name = string("op_1163_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1163_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_1163_cast_fp16_1 = split(axis = var_1163_axis_0, split_sizes = var_1162, x = normed_output_85_cast_fp16)[name = string("op_1163_cast_fp16")];
+            tensor<int32, [2]> var_1166 = const()[name = string("op_1166"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1167_axis_0 = const()[name = string("op_1167_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1167_0, tensor<fp16, [1, 2304, 32]> var_1167_1 = split(axis = var_1167_axis_0, split_sizes = var_1166, x = var_160_cast_fp16)[name = string("op_1167")];
+            tensor<int32, [2]> var_1170 = const()[name = string("op_1170"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1171_axis_0 = const()[name = string("op_1171_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1171_0, tensor<fp16, [1, 2304, 32]> var_1171_1 = split(axis = var_1171_axis_0, split_sizes = var_1170, x = var_163_cast_fp16)[name = string("op_1171")];
+            tensor<int32, [1]> cos_53_axes_0 = const()[name = string("cos_53_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_53 = expand_dims(axes = cos_53_axes_0, x = var_1167_0)[name = string("cos_53")];
+            tensor<int32, [1]> sin_53_axes_0 = const()[name = string("sin_53_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_53 = expand_dims(axes = sin_53_axes_0, x = var_1171_0)[name = string("sin_53")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1176_cast_fp16 = mul(x = var_1163_cast_fp16_0, y = cos_53)[name = string("op_1176_cast_fp16")];
+            tensor<int32, [4]> x1_25_begin_0 = const()[name = string("x1_25_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_25_end_0 = const()[name = string("x1_25_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_25_end_mask_0 = const()[name = string("x1_25_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_25_cast_fp16 = slice_by_index(begin = x1_25_begin_0, end = x1_25_end_0, end_mask = x1_25_end_mask_0, x = var_1163_cast_fp16_0)[name = string("x1_25_cast_fp16")];
+            tensor<int32, [4]> x2_25_begin_0 = const()[name = string("x2_25_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_25_end_0 = const()[name = string("x2_25_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_25_end_mask_0 = const()[name = string("x2_25_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_25_cast_fp16 = slice_by_index(begin = x2_25_begin_0, end = x2_25_end_0, end_mask = x2_25_end_mask_0, x = var_1163_cast_fp16_0)[name = string("x2_25_cast_fp16")];
+            fp16 const_86_promoted_to_fp16 = const()[name = string("const_86_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1187_cast_fp16 = mul(x = x2_25_cast_fp16, y = const_86_promoted_to_fp16)[name = string("op_1187_cast_fp16")];
+            bool var_1189_interleave_0 = const()[name = string("op_1189_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1189_cast_fp16 = concat(axis = var_38, interleave = var_1189_interleave_0, values = (var_1187_cast_fp16, x1_25_cast_fp16))[name = string("op_1189_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1190_cast_fp16 = mul(x = var_1189_cast_fp16, y = sin_53)[name = string("op_1190_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1191_cast_fp16 = add(x = var_1176_cast_fp16, y = var_1190_cast_fp16)[name = string("op_1191_cast_fp16")];
+            tensor<int32, [1]> cos_57_axes_0 = const()[name = string("cos_57_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_57 = expand_dims(axes = cos_57_axes_0, x = var_1167_1)[name = string("cos_57")];
+            tensor<int32, [1]> sin_57_axes_0 = const()[name = string("sin_57_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_57 = expand_dims(axes = sin_57_axes_0, x = var_1171_1)[name = string("sin_57")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1194_cast_fp16 = mul(x = var_1163_cast_fp16_1, y = cos_57)[name = string("op_1194_cast_fp16")];
+            tensor<int32, [4]> x1_27_begin_0 = const()[name = string("x1_27_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_27_end_0 = const()[name = string("x1_27_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_27_end_mask_0 = const()[name = string("x1_27_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_27_cast_fp16 = slice_by_index(begin = x1_27_begin_0, end = x1_27_end_0, end_mask = x1_27_end_mask_0, x = var_1163_cast_fp16_1)[name = string("x1_27_cast_fp16")];
+            tensor<int32, [4]> x2_27_begin_0 = const()[name = string("x2_27_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_27_end_0 = const()[name = string("x2_27_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_27_end_mask_0 = const()[name = string("x2_27_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_27_cast_fp16 = slice_by_index(begin = x2_27_begin_0, end = x2_27_end_0, end_mask = x2_27_end_mask_0, x = var_1163_cast_fp16_1)[name = string("x2_27_cast_fp16")];
+            fp16 const_89_promoted_to_fp16 = const()[name = string("const_89_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1205_cast_fp16 = mul(x = x2_27_cast_fp16, y = const_89_promoted_to_fp16)[name = string("op_1205_cast_fp16")];
+            bool var_1207_interleave_0 = const()[name = string("op_1207_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1207_cast_fp16 = concat(axis = var_38, interleave = var_1207_interleave_0, values = (var_1205_cast_fp16, x1_27_cast_fp16))[name = string("op_1207_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1208_cast_fp16 = mul(x = var_1207_cast_fp16, y = sin_57)[name = string("op_1208_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1209_cast_fp16 = add(x = var_1194_cast_fp16, y = var_1208_cast_fp16)[name = string("op_1209_cast_fp16")];
+            bool query_states_7_interleave_0 = const()[name = string("query_states_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_7_cast_fp16 = concat(axis = var_38, interleave = query_states_7_interleave_0, values = (var_1191_cast_fp16, var_1209_cast_fp16))[name = string("query_states_7_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_3_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(94010496)))];
+            tensor<fp16, [1, 2304, 768]> linear_23_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_3_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_42_cast_fp16)[name = string("linear_23_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.76p+4)];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.74p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_45_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_23_cast_fp16)[name = string("clip_45_cast_fp16")];
+            tensor<int32, [4]> var_1222 = const()[name = string("op_1222"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_191_cast_fp16 = reshape(shape = var_1222, x = clip_45_cast_fp16)[name = string("hidden_states_191_cast_fp16")];
+            fp16 var_33_promoted_23_to_fp16 = const()[name = string("op_33_promoted_23_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1226_cast_fp16 = pow(x = hidden_states_191_cast_fp16, y = var_33_promoted_23_to_fp16)[name = string("op_1226_cast_fp16")];
+            tensor<int32, [1]> var_1228_axes_0 = const()[name = string("op_1228_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1228_keep_dims_0 = const()[name = string("op_1228_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1228_cast_fp16 = reduce_mean(axes = var_1228_axes_0, keep_dims = var_1228_keep_dims_0, x = var_1226_cast_fp16)[name = string("op_1228_cast_fp16")];
+            fp16 var_1229_to_fp16 = const()[name = string("op_1229_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_47_cast_fp16 = add(x = var_1228_cast_fp16, y = var_1229_to_fp16)[name = string("mean_squared_47_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1231_cast_fp16 = pow(x = mean_squared_47_cast_fp16, y = var_27_to_fp16)[name = string("op_1231_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_87_cast_fp16 = mul(x = hidden_states_191_cast_fp16, y = var_1231_cast_fp16)[name = string("normed_output_87_cast_fp16")];
+            tensor<fp16, [64]> const_90_to_fp16 = const()[name = string("const_90_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95190208)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_89_cast_fp16 = mul(x = normed_output_87_cast_fp16, y = const_90_to_fp16)[name = string("normed_output_89_cast_fp16")];
+            tensor<int32, [2]> var_1251 = const()[name = string("op_1251"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1252_axis_0 = const()[name = string("op_1252_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1252_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_1252_cast_fp16_1 = split(axis = var_1252_axis_0, split_sizes = var_1251, x = normed_output_89_cast_fp16)[name = string("op_1252_cast_fp16")];
+            tensor<int32, [2]> var_1255 = const()[name = string("op_1255"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1256_axis_0 = const()[name = string("op_1256_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1256_0, tensor<fp16, [1, 2304, 32]> var_1256_1 = split(axis = var_1256_axis_0, split_sizes = var_1255, x = var_160_cast_fp16)[name = string("op_1256")];
+            tensor<int32, [2]> var_1259 = const()[name = string("op_1259"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1260_axis_0 = const()[name = string("op_1260_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1260_0, tensor<fp16, [1, 2304, 32]> var_1260_1 = split(axis = var_1260_axis_0, split_sizes = var_1259, x = var_163_cast_fp16)[name = string("op_1260")];
+            tensor<int32, [1]> cos_61_axes_0 = const()[name = string("cos_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_61 = expand_dims(axes = cos_61_axes_0, x = var_1256_0)[name = string("cos_61")];
+            tensor<int32, [1]> sin_61_axes_0 = const()[name = string("sin_61_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_61 = expand_dims(axes = sin_61_axes_0, x = var_1260_0)[name = string("sin_61")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1265_cast_fp16 = mul(x = var_1252_cast_fp16_0, y = cos_61)[name = string("op_1265_cast_fp16")];
+            tensor<int32, [4]> x1_29_begin_0 = const()[name = string("x1_29_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_29_end_0 = const()[name = string("x1_29_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_29_end_mask_0 = const()[name = string("x1_29_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_29_cast_fp16 = slice_by_index(begin = x1_29_begin_0, end = x1_29_end_0, end_mask = x1_29_end_mask_0, x = var_1252_cast_fp16_0)[name = string("x1_29_cast_fp16")];
+            tensor<int32, [4]> x2_29_begin_0 = const()[name = string("x2_29_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_29_end_0 = const()[name = string("x2_29_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_29_end_mask_0 = const()[name = string("x2_29_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_29_cast_fp16 = slice_by_index(begin = x2_29_begin_0, end = x2_29_end_0, end_mask = x2_29_end_mask_0, x = var_1252_cast_fp16_0)[name = string("x2_29_cast_fp16")];
+            fp16 const_95_promoted_to_fp16 = const()[name = string("const_95_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1276_cast_fp16 = mul(x = x2_29_cast_fp16, y = const_95_promoted_to_fp16)[name = string("op_1276_cast_fp16")];
+            bool var_1278_interleave_0 = const()[name = string("op_1278_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1278_cast_fp16 = concat(axis = var_38, interleave = var_1278_interleave_0, values = (var_1276_cast_fp16, x1_29_cast_fp16))[name = string("op_1278_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1279_cast_fp16 = mul(x = var_1278_cast_fp16, y = sin_61)[name = string("op_1279_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1280_cast_fp16 = add(x = var_1265_cast_fp16, y = var_1279_cast_fp16)[name = string("op_1280_cast_fp16")];
+            tensor<int32, [1]> cos_65_axes_0 = const()[name = string("cos_65_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_65 = expand_dims(axes = cos_65_axes_0, x = var_1256_1)[name = string("cos_65")];
+            tensor<int32, [1]> sin_65_axes_0 = const()[name = string("sin_65_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_65 = expand_dims(axes = sin_65_axes_0, x = var_1260_1)[name = string("sin_65")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1283_cast_fp16 = mul(x = var_1252_cast_fp16_1, y = cos_65)[name = string("op_1283_cast_fp16")];
+            tensor<int32, [4]> x1_31_begin_0 = const()[name = string("x1_31_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_31_end_0 = const()[name = string("x1_31_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_31_end_mask_0 = const()[name = string("x1_31_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_31_cast_fp16 = slice_by_index(begin = x1_31_begin_0, end = x1_31_end_0, end_mask = x1_31_end_mask_0, x = var_1252_cast_fp16_1)[name = string("x1_31_cast_fp16")];
+            tensor<int32, [4]> x2_31_begin_0 = const()[name = string("x2_31_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_31_end_0 = const()[name = string("x2_31_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_31_end_mask_0 = const()[name = string("x2_31_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_31_cast_fp16 = slice_by_index(begin = x2_31_begin_0, end = x2_31_end_0, end_mask = x2_31_end_mask_0, x = var_1252_cast_fp16_1)[name = string("x2_31_cast_fp16")];
+            fp16 const_98_promoted_to_fp16 = const()[name = string("const_98_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1294_cast_fp16 = mul(x = x2_31_cast_fp16, y = const_98_promoted_to_fp16)[name = string("op_1294_cast_fp16")];
+            bool var_1296_interleave_0 = const()[name = string("op_1296_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1296_cast_fp16 = concat(axis = var_38, interleave = var_1296_interleave_0, values = (var_1294_cast_fp16, x1_31_cast_fp16))[name = string("op_1296_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1297_cast_fp16 = mul(x = var_1296_cast_fp16, y = sin_65)[name = string("op_1297_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1298_cast_fp16 = add(x = var_1283_cast_fp16, y = var_1297_cast_fp16)[name = string("op_1298_cast_fp16")];
+            bool key_states_7_interleave_0 = const()[name = string("key_states_7_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_7_cast_fp16 = concat(axis = var_38, interleave = key_states_7_interleave_0, values = (var_1280_cast_fp16, var_1298_cast_fp16))[name = string("key_states_7_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_3_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(95190400)))];
+            tensor<fp16, [1, 2304, 768]> linear_24_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_3_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_42_cast_fp16)[name = string("linear_24_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.76p+4)];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.74p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_47_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_24_cast_fp16)[name = string("clip_47_cast_fp16")];
+            tensor<int32, [4]> var_1311 = const()[name = string("op_1311"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_197_cast_fp16 = reshape(shape = var_1311, x = clip_47_cast_fp16)[name = string("hidden_states_197_cast_fp16")];
+            fp16 var_33_promoted_24_to_fp16 = const()[name = string("op_33_promoted_24_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1314_cast_fp16 = pow(x = hidden_states_197_cast_fp16, y = var_33_promoted_24_to_fp16)[name = string("op_1314_cast_fp16")];
+            tensor<int32, [1]> var_1316_axes_0 = const()[name = string("op_1316_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1316_keep_dims_0 = const()[name = string("op_1316_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1316_cast_fp16 = reduce_mean(axes = var_1316_axes_0, keep_dims = var_1316_keep_dims_0, x = var_1314_cast_fp16)[name = string("op_1316_cast_fp16")];
+            fp16 var_1317_to_fp16 = const()[name = string("op_1317_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_49_cast_fp16 = add(x = var_1316_cast_fp16, y = var_1317_to_fp16)[name = string("mean_squared_49_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1319_cast_fp16 = pow(x = mean_squared_49_cast_fp16, y = var_27_to_fp16)[name = string("op_1319_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_91_cast_fp16 = mul(x = hidden_states_197_cast_fp16, y = var_1319_cast_fp16)[name = string("normed_output_91_cast_fp16")];
+            tensor<int32, [4]> hidden_states_203_perm_0 = const()[name = string("hidden_states_203_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_3_transpose_y_0 = const()[name = string("matmul_3_transpose_y_0"), val = bool(true)];
+            bool matmul_3_transpose_x_0 = const()[name = string("matmul_3_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_70_perm_0 = const()[name = string("transpose_70_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_71_perm_0 = const()[name = string("transpose_71_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_71 = transpose(perm = transpose_71_perm_0, x = key_states_7_cast_fp16)[name = string("transpose_145")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_70 = transpose(perm = transpose_70_perm_0, x = query_states_7_cast_fp16)[name = string("transpose_146")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_3_cast_fp16 = matmul(transpose_x = matmul_3_transpose_x_0, transpose_y = matmul_3_transpose_y_0, x = transpose_70, y = transpose_71)[name = string("matmul_3_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_3_cast_fp16 = add(x = matmul_3_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_3_cast_fp16")];
+            int32 softmax_3_axis_0 = const()[name = string("softmax_3_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_3_cast_fp16 = softmax(axis = softmax_3_axis_0, x = add_3_cast_fp16)[name = string("softmax_3_cast_fp16")];
+            bool attn_output_13_transpose_x_0 = const()[name = string("attn_output_13_transpose_x_0"), val = bool(false)];
+            bool attn_output_13_transpose_y_0 = const()[name = string("attn_output_13_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_203_cast_fp16 = transpose(perm = hidden_states_203_perm_0, x = normed_output_91_cast_fp16)[name = string("transpose_147")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_13_cast_fp16 = matmul(transpose_x = attn_output_13_transpose_x_0, transpose_y = attn_output_13_transpose_y_0, x = softmax_3_cast_fp16, y = hidden_states_203_cast_fp16)[name = string("attn_output_13_cast_fp16")];
+            tensor<int32, [4]> var_1324_perm_0 = const()[name = string("op_1324_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1326 = const()[name = string("op_1326"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_1324_cast_fp16 = transpose(perm = var_1324_perm_0, x = attn_output_13_cast_fp16)[name = string("transpose_144")];
+            tensor<fp16, [1, 2304, 768]> var_1327_cast_fp16 = reshape(shape = var_1326, x = var_1324_cast_fp16)[name = string("op_1327_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.6cp+1)];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.6ap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_48_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_self_attn_o_proj_input_max_promoted_to_fp16, x = var_1327_cast_fp16)[name = string("clip_48_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_3_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(96370112)))];
+            tensor<fp16, [1, 2304, 768]> linear_25_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_3_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_48_cast_fp16)[name = string("linear_25_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.02p+3)];
+            fp16 model_vision_tower_encoder_layers_3_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_49_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_25_cast_fp16)[name = string("clip_49_cast_fp16")];
+            fp16 var_33_promoted_25_to_fp16 = const()[name = string("op_33_promoted_25_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1340_cast_fp16 = pow(x = clip_49_cast_fp16, y = var_33_promoted_25_to_fp16)[name = string("op_1340_cast_fp16")];
+            tensor<int32, [1]> var_1342_axes_0 = const()[name = string("op_1342_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1342_keep_dims_0 = const()[name = string("op_1342_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1342_cast_fp16 = reduce_mean(axes = var_1342_axes_0, keep_dims = var_1342_keep_dims_0, x = var_1340_cast_fp16)[name = string("op_1342_cast_fp16")];
+            fp16 var_1343_to_fp16 = const()[name = string("op_1343_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_51_cast_fp16 = add(x = var_1342_cast_fp16, y = var_1343_to_fp16)[name = string("mean_squared_51_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1345_cast_fp16 = pow(x = mean_squared_51_cast_fp16, y = var_27_to_fp16)[name = string("op_1345_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_93_cast_fp16 = mul(x = clip_49_cast_fp16, y = var_1345_cast_fp16)[name = string("normed_output_93_cast_fp16")];
+            tensor<fp16, [768]> const_99_to_fp16 = const()[name = string("const_99_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97549824)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_95_cast_fp16 = mul(x = normed_output_93_cast_fp16, y = const_99_to_fp16)[name = string("normed_output_95_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_215_cast_fp16 = add(x = hidden_states_177_cast_fp16, y = normed_output_95_cast_fp16)[name = string("hidden_states_215_cast_fp16")];
+            fp16 var_33_promoted_26_to_fp16 = const()[name = string("op_33_promoted_26_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1353_cast_fp16 = pow(x = hidden_states_215_cast_fp16, y = var_33_promoted_26_to_fp16)[name = string("op_1353_cast_fp16")];
+            tensor<int32, [1]> var_1355_axes_0 = const()[name = string("op_1355_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1355_keep_dims_0 = const()[name = string("op_1355_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1355_cast_fp16 = reduce_mean(axes = var_1355_axes_0, keep_dims = var_1355_keep_dims_0, x = var_1353_cast_fp16)[name = string("op_1355_cast_fp16")];
+            fp16 var_1356_to_fp16 = const()[name = string("op_1356_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_53_cast_fp16 = add(x = var_1355_cast_fp16, y = var_1356_to_fp16)[name = string("mean_squared_53_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1358_cast_fp16 = pow(x = mean_squared_53_cast_fp16, y = var_27_to_fp16)[name = string("op_1358_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_97_cast_fp16 = mul(x = hidden_states_215_cast_fp16, y = var_1358_cast_fp16)[name = string("normed_output_97_cast_fp16")];
+            tensor<fp16, [768]> const_100_to_fp16 = const()[name = string("const_100_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97551424)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_99_cast_fp16 = mul(x = normed_output_97_cast_fp16, y = const_100_to_fp16)[name = string("normed_output_99_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.8cp+3)];
+            fp16 model_vision_tower_encoder_layers_3_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.88p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_50_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_99_cast_fp16)[name = string("clip_50_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_3_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(97553024)))];
+            tensor<fp16, [1, 2304, 3072]> linear_26_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_3_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_50_cast_fp16)[name = string("linear_26_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.02p+4)];
+            fp16 model_vision_tower_encoder_layers_3_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_51_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_26_cast_fp16)[name = string("clip_51_cast_fp16")];
+            string var_1375_mode_0 = const()[name = string("op_1375_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_1375_cast_fp16 = gelu(mode = var_1375_mode_0, x = clip_51_cast_fp16)[name = string("op_1375_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_3_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(102271680)))];
+            tensor<fp16, [1, 2304, 3072]> linear_27_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_3_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_50_cast_fp16)[name = string("linear_27_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.02p+4)];
+            fp16 model_vision_tower_encoder_layers_3_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_53_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_mlp_up_proj_output_max_promoted_to_fp16, x = linear_27_cast_fp16)[name = string("clip_53_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_225_cast_fp16 = mul(x = var_1375_cast_fp16, y = clip_53_cast_fp16)[name = string("hidden_states_225_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.5ep+6)];
+            fp16 model_vision_tower_encoder_layers_3_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.5cp+6)];
+            tensor<fp16, [1, 2304, 3072]> clip_54_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_225_cast_fp16)[name = string("clip_54_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_3_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(106990336)))];
+            tensor<fp16, [1, 2304, 768]> linear_28_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_3_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_54_cast_fp16)[name = string("linear_28_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_3_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.f6p+5)];
+            fp16 model_vision_tower_encoder_layers_3_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_3_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.f2p+5)];
+            tensor<fp16, [1, 2304, 768]> clip_55_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_3_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_3_mlp_down_proj_output_max_promoted_to_fp16, x = linear_28_cast_fp16)[name = string("clip_55_cast_fp16")];
+            fp16 var_33_promoted_27_to_fp16 = const()[name = string("op_33_promoted_27_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1397_cast_fp16 = pow(x = clip_55_cast_fp16, y = var_33_promoted_27_to_fp16)[name = string("op_1397_cast_fp16")];
+            tensor<int32, [1]> var_1399_axes_0 = const()[name = string("op_1399_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1399_keep_dims_0 = const()[name = string("op_1399_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1399_cast_fp16 = reduce_mean(axes = var_1399_axes_0, keep_dims = var_1399_keep_dims_0, x = var_1397_cast_fp16)[name = string("op_1399_cast_fp16")];
+            fp16 var_1400_to_fp16 = const()[name = string("op_1400_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_55_cast_fp16 = add(x = var_1399_cast_fp16, y = var_1400_to_fp16)[name = string("mean_squared_55_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1402_cast_fp16 = pow(x = mean_squared_55_cast_fp16, y = var_27_to_fp16)[name = string("op_1402_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_101_cast_fp16 = mul(x = clip_55_cast_fp16, y = var_1402_cast_fp16)[name = string("normed_output_101_cast_fp16")];
+            tensor<fp16, [768]> const_101_to_fp16 = const()[name = string("const_101_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111708992)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_103_cast_fp16 = mul(x = normed_output_101_cast_fp16, y = const_101_to_fp16)[name = string("normed_output_103_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_235_cast_fp16 = add(x = hidden_states_215_cast_fp16, y = normed_output_103_cast_fp16)[name = string("hidden_states_235_cast_fp16")];
+            fp16 var_33_promoted_28_to_fp16 = const()[name = string("op_33_promoted_28_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1416_cast_fp16 = pow(x = hidden_states_235_cast_fp16, y = var_33_promoted_28_to_fp16)[name = string("op_1416_cast_fp16")];
+            tensor<int32, [1]> var_1418_axes_0 = const()[name = string("op_1418_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1418_keep_dims_0 = const()[name = string("op_1418_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1418_cast_fp16 = reduce_mean(axes = var_1418_axes_0, keep_dims = var_1418_keep_dims_0, x = var_1416_cast_fp16)[name = string("op_1418_cast_fp16")];
+            fp16 var_1419_to_fp16 = const()[name = string("op_1419_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_57_cast_fp16 = add(x = var_1418_cast_fp16, y = var_1419_to_fp16)[name = string("mean_squared_57_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1421_cast_fp16 = pow(x = mean_squared_57_cast_fp16, y = var_27_to_fp16)[name = string("op_1421_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_105_cast_fp16 = mul(x = hidden_states_235_cast_fp16, y = var_1421_cast_fp16)[name = string("normed_output_105_cast_fp16")];
+            tensor<fp16, [768]> const_102_to_fp16 = const()[name = string("const_102_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111710592)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_107_cast_fp16 = mul(x = normed_output_105_cast_fp16, y = const_102_to_fp16)[name = string("normed_output_107_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.9ap+3)];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.96p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_56_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_107_cast_fp16)[name = string("clip_56_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_4_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(111712192)))];
+            tensor<fp16, [1, 2304, 768]> linear_29_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_4_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_56_cast_fp16)[name = string("linear_29_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.32p+4)];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_57_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_29_cast_fp16)[name = string("clip_57_cast_fp16")];
+            tensor<int32, [4]> var_1443 = const()[name = string("op_1443"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_243_cast_fp16 = reshape(shape = var_1443, x = clip_57_cast_fp16)[name = string("hidden_states_243_cast_fp16")];
+            fp16 var_33_promoted_29_to_fp16 = const()[name = string("op_33_promoted_29_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1447_cast_fp16 = pow(x = hidden_states_243_cast_fp16, y = var_33_promoted_29_to_fp16)[name = string("op_1447_cast_fp16")];
+            tensor<int32, [1]> var_1449_axes_0 = const()[name = string("op_1449_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1449_keep_dims_0 = const()[name = string("op_1449_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1449_cast_fp16 = reduce_mean(axes = var_1449_axes_0, keep_dims = var_1449_keep_dims_0, x = var_1447_cast_fp16)[name = string("op_1449_cast_fp16")];
+            fp16 var_1450_to_fp16 = const()[name = string("op_1450_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_59_cast_fp16 = add(x = var_1449_cast_fp16, y = var_1450_to_fp16)[name = string("mean_squared_59_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1452_cast_fp16 = pow(x = mean_squared_59_cast_fp16, y = var_27_to_fp16)[name = string("op_1452_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_109_cast_fp16 = mul(x = hidden_states_243_cast_fp16, y = var_1452_cast_fp16)[name = string("normed_output_109_cast_fp16")];
+            tensor<fp16, [64]> const_105_to_fp16 = const()[name = string("const_105_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112891904)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_111_cast_fp16 = mul(x = normed_output_109_cast_fp16, y = const_105_to_fp16)[name = string("normed_output_111_cast_fp16")];
+            tensor<int32, [2]> var_1472 = const()[name = string("op_1472"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1473_axis_0 = const()[name = string("op_1473_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1473_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_1473_cast_fp16_1 = split(axis = var_1473_axis_0, split_sizes = var_1472, x = normed_output_111_cast_fp16)[name = string("op_1473_cast_fp16")];
+            tensor<int32, [2]> var_1476 = const()[name = string("op_1476"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1477_axis_0 = const()[name = string("op_1477_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1477_0, tensor<fp16, [1, 2304, 32]> var_1477_1 = split(axis = var_1477_axis_0, split_sizes = var_1476, x = var_160_cast_fp16)[name = string("op_1477")];
+            tensor<int32, [2]> var_1480 = const()[name = string("op_1480"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1481_axis_0 = const()[name = string("op_1481_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1481_0, tensor<fp16, [1, 2304, 32]> var_1481_1 = split(axis = var_1481_axis_0, split_sizes = var_1480, x = var_163_cast_fp16)[name = string("op_1481")];
+            tensor<int32, [1]> cos_69_axes_0 = const()[name = string("cos_69_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_69 = expand_dims(axes = cos_69_axes_0, x = var_1477_0)[name = string("cos_69")];
+            tensor<int32, [1]> sin_69_axes_0 = const()[name = string("sin_69_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_69 = expand_dims(axes = sin_69_axes_0, x = var_1481_0)[name = string("sin_69")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1486_cast_fp16 = mul(x = var_1473_cast_fp16_0, y = cos_69)[name = string("op_1486_cast_fp16")];
+            tensor<int32, [4]> x1_33_begin_0 = const()[name = string("x1_33_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_33_end_0 = const()[name = string("x1_33_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_33_end_mask_0 = const()[name = string("x1_33_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_33_cast_fp16 = slice_by_index(begin = x1_33_begin_0, end = x1_33_end_0, end_mask = x1_33_end_mask_0, x = var_1473_cast_fp16_0)[name = string("x1_33_cast_fp16")];
+            tensor<int32, [4]> x2_33_begin_0 = const()[name = string("x2_33_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_33_end_0 = const()[name = string("x2_33_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_33_end_mask_0 = const()[name = string("x2_33_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_33_cast_fp16 = slice_by_index(begin = x2_33_begin_0, end = x2_33_end_0, end_mask = x2_33_end_mask_0, x = var_1473_cast_fp16_0)[name = string("x2_33_cast_fp16")];
+            fp16 const_110_promoted_to_fp16 = const()[name = string("const_110_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1497_cast_fp16 = mul(x = x2_33_cast_fp16, y = const_110_promoted_to_fp16)[name = string("op_1497_cast_fp16")];
+            bool var_1499_interleave_0 = const()[name = string("op_1499_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1499_cast_fp16 = concat(axis = var_38, interleave = var_1499_interleave_0, values = (var_1497_cast_fp16, x1_33_cast_fp16))[name = string("op_1499_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1500_cast_fp16 = mul(x = var_1499_cast_fp16, y = sin_69)[name = string("op_1500_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1501_cast_fp16 = add(x = var_1486_cast_fp16, y = var_1500_cast_fp16)[name = string("op_1501_cast_fp16")];
+            tensor<int32, [1]> cos_73_axes_0 = const()[name = string("cos_73_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_73 = expand_dims(axes = cos_73_axes_0, x = var_1477_1)[name = string("cos_73")];
+            tensor<int32, [1]> sin_73_axes_0 = const()[name = string("sin_73_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_73 = expand_dims(axes = sin_73_axes_0, x = var_1481_1)[name = string("sin_73")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1504_cast_fp16 = mul(x = var_1473_cast_fp16_1, y = cos_73)[name = string("op_1504_cast_fp16")];
+            tensor<int32, [4]> x1_35_begin_0 = const()[name = string("x1_35_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_35_end_0 = const()[name = string("x1_35_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_35_end_mask_0 = const()[name = string("x1_35_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_35_cast_fp16 = slice_by_index(begin = x1_35_begin_0, end = x1_35_end_0, end_mask = x1_35_end_mask_0, x = var_1473_cast_fp16_1)[name = string("x1_35_cast_fp16")];
+            tensor<int32, [4]> x2_35_begin_0 = const()[name = string("x2_35_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_35_end_0 = const()[name = string("x2_35_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_35_end_mask_0 = const()[name = string("x2_35_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_35_cast_fp16 = slice_by_index(begin = x2_35_begin_0, end = x2_35_end_0, end_mask = x2_35_end_mask_0, x = var_1473_cast_fp16_1)[name = string("x2_35_cast_fp16")];
+            fp16 const_113_promoted_to_fp16 = const()[name = string("const_113_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1515_cast_fp16 = mul(x = x2_35_cast_fp16, y = const_113_promoted_to_fp16)[name = string("op_1515_cast_fp16")];
+            bool var_1517_interleave_0 = const()[name = string("op_1517_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1517_cast_fp16 = concat(axis = var_38, interleave = var_1517_interleave_0, values = (var_1515_cast_fp16, x1_35_cast_fp16))[name = string("op_1517_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1518_cast_fp16 = mul(x = var_1517_cast_fp16, y = sin_73)[name = string("op_1518_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1519_cast_fp16 = add(x = var_1504_cast_fp16, y = var_1518_cast_fp16)[name = string("op_1519_cast_fp16")];
+            bool query_states_9_interleave_0 = const()[name = string("query_states_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_9_cast_fp16 = concat(axis = var_38, interleave = query_states_9_interleave_0, values = (var_1501_cast_fp16, var_1519_cast_fp16))[name = string("query_states_9_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_4_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(112892096)))];
+            tensor<fp16, [1, 2304, 768]> linear_30_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_4_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_56_cast_fp16)[name = string("linear_30_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_59_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_30_cast_fp16)[name = string("clip_59_cast_fp16")];
+            tensor<int32, [4]> var_1532 = const()[name = string("op_1532"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_249_cast_fp16 = reshape(shape = var_1532, x = clip_59_cast_fp16)[name = string("hidden_states_249_cast_fp16")];
+            fp16 var_33_promoted_30_to_fp16 = const()[name = string("op_33_promoted_30_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1536_cast_fp16 = pow(x = hidden_states_249_cast_fp16, y = var_33_promoted_30_to_fp16)[name = string("op_1536_cast_fp16")];
+            tensor<int32, [1]> var_1538_axes_0 = const()[name = string("op_1538_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1538_keep_dims_0 = const()[name = string("op_1538_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1538_cast_fp16 = reduce_mean(axes = var_1538_axes_0, keep_dims = var_1538_keep_dims_0, x = var_1536_cast_fp16)[name = string("op_1538_cast_fp16")];
+            fp16 var_1539_to_fp16 = const()[name = string("op_1539_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_61_cast_fp16 = add(x = var_1538_cast_fp16, y = var_1539_to_fp16)[name = string("mean_squared_61_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1541_cast_fp16 = pow(x = mean_squared_61_cast_fp16, y = var_27_to_fp16)[name = string("op_1541_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_113_cast_fp16 = mul(x = hidden_states_249_cast_fp16, y = var_1541_cast_fp16)[name = string("normed_output_113_cast_fp16")];
+            tensor<fp16, [64]> const_114_to_fp16 = const()[name = string("const_114_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114071808)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_115_cast_fp16 = mul(x = normed_output_113_cast_fp16, y = const_114_to_fp16)[name = string("normed_output_115_cast_fp16")];
+            tensor<int32, [2]> var_1561 = const()[name = string("op_1561"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1562_axis_0 = const()[name = string("op_1562_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1562_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_1562_cast_fp16_1 = split(axis = var_1562_axis_0, split_sizes = var_1561, x = normed_output_115_cast_fp16)[name = string("op_1562_cast_fp16")];
+            tensor<int32, [2]> var_1565 = const()[name = string("op_1565"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1566_axis_0 = const()[name = string("op_1566_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1566_0, tensor<fp16, [1, 2304, 32]> var_1566_1 = split(axis = var_1566_axis_0, split_sizes = var_1565, x = var_160_cast_fp16)[name = string("op_1566")];
+            tensor<int32, [2]> var_1569 = const()[name = string("op_1569"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1570_axis_0 = const()[name = string("op_1570_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1570_0, tensor<fp16, [1, 2304, 32]> var_1570_1 = split(axis = var_1570_axis_0, split_sizes = var_1569, x = var_163_cast_fp16)[name = string("op_1570")];
+            tensor<int32, [1]> cos_77_axes_0 = const()[name = string("cos_77_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_77 = expand_dims(axes = cos_77_axes_0, x = var_1566_0)[name = string("cos_77")];
+            tensor<int32, [1]> sin_77_axes_0 = const()[name = string("sin_77_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_77 = expand_dims(axes = sin_77_axes_0, x = var_1570_0)[name = string("sin_77")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1575_cast_fp16 = mul(x = var_1562_cast_fp16_0, y = cos_77)[name = string("op_1575_cast_fp16")];
+            tensor<int32, [4]> x1_37_begin_0 = const()[name = string("x1_37_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_37_end_0 = const()[name = string("x1_37_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_37_end_mask_0 = const()[name = string("x1_37_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_37_cast_fp16 = slice_by_index(begin = x1_37_begin_0, end = x1_37_end_0, end_mask = x1_37_end_mask_0, x = var_1562_cast_fp16_0)[name = string("x1_37_cast_fp16")];
+            tensor<int32, [4]> x2_37_begin_0 = const()[name = string("x2_37_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_37_end_0 = const()[name = string("x2_37_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_37_end_mask_0 = const()[name = string("x2_37_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_37_cast_fp16 = slice_by_index(begin = x2_37_begin_0, end = x2_37_end_0, end_mask = x2_37_end_mask_0, x = var_1562_cast_fp16_0)[name = string("x2_37_cast_fp16")];
+            fp16 const_119_promoted_to_fp16 = const()[name = string("const_119_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1586_cast_fp16 = mul(x = x2_37_cast_fp16, y = const_119_promoted_to_fp16)[name = string("op_1586_cast_fp16")];
+            bool var_1588_interleave_0 = const()[name = string("op_1588_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1588_cast_fp16 = concat(axis = var_38, interleave = var_1588_interleave_0, values = (var_1586_cast_fp16, x1_37_cast_fp16))[name = string("op_1588_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1589_cast_fp16 = mul(x = var_1588_cast_fp16, y = sin_77)[name = string("op_1589_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1590_cast_fp16 = add(x = var_1575_cast_fp16, y = var_1589_cast_fp16)[name = string("op_1590_cast_fp16")];
+            tensor<int32, [1]> cos_81_axes_0 = const()[name = string("cos_81_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_81 = expand_dims(axes = cos_81_axes_0, x = var_1566_1)[name = string("cos_81")];
+            tensor<int32, [1]> sin_81_axes_0 = const()[name = string("sin_81_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_81 = expand_dims(axes = sin_81_axes_0, x = var_1570_1)[name = string("sin_81")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1593_cast_fp16 = mul(x = var_1562_cast_fp16_1, y = cos_81)[name = string("op_1593_cast_fp16")];
+            tensor<int32, [4]> x1_39_begin_0 = const()[name = string("x1_39_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_39_end_0 = const()[name = string("x1_39_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_39_end_mask_0 = const()[name = string("x1_39_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_39_cast_fp16 = slice_by_index(begin = x1_39_begin_0, end = x1_39_end_0, end_mask = x1_39_end_mask_0, x = var_1562_cast_fp16_1)[name = string("x1_39_cast_fp16")];
+            tensor<int32, [4]> x2_39_begin_0 = const()[name = string("x2_39_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_39_end_0 = const()[name = string("x2_39_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_39_end_mask_0 = const()[name = string("x2_39_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_39_cast_fp16 = slice_by_index(begin = x2_39_begin_0, end = x2_39_end_0, end_mask = x2_39_end_mask_0, x = var_1562_cast_fp16_1)[name = string("x2_39_cast_fp16")];
+            fp16 const_122_promoted_to_fp16 = const()[name = string("const_122_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1604_cast_fp16 = mul(x = x2_39_cast_fp16, y = const_122_promoted_to_fp16)[name = string("op_1604_cast_fp16")];
+            bool var_1606_interleave_0 = const()[name = string("op_1606_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1606_cast_fp16 = concat(axis = var_38, interleave = var_1606_interleave_0, values = (var_1604_cast_fp16, x1_39_cast_fp16))[name = string("op_1606_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1607_cast_fp16 = mul(x = var_1606_cast_fp16, y = sin_81)[name = string("op_1607_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1608_cast_fp16 = add(x = var_1593_cast_fp16, y = var_1607_cast_fp16)[name = string("op_1608_cast_fp16")];
+            bool key_states_9_interleave_0 = const()[name = string("key_states_9_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_9_cast_fp16 = concat(axis = var_38, interleave = key_states_9_interleave_0, values = (var_1590_cast_fp16, var_1608_cast_fp16))[name = string("key_states_9_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_4_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(114072000)))];
+            tensor<fp16, [1, 2304, 768]> linear_31_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_4_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_56_cast_fp16)[name = string("linear_31_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_61_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_31_cast_fp16)[name = string("clip_61_cast_fp16")];
+            tensor<int32, [4]> var_1621 = const()[name = string("op_1621"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_255_cast_fp16 = reshape(shape = var_1621, x = clip_61_cast_fp16)[name = string("hidden_states_255_cast_fp16")];
+            fp16 var_33_promoted_31_to_fp16 = const()[name = string("op_33_promoted_31_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1624_cast_fp16 = pow(x = hidden_states_255_cast_fp16, y = var_33_promoted_31_to_fp16)[name = string("op_1624_cast_fp16")];
+            tensor<int32, [1]> var_1626_axes_0 = const()[name = string("op_1626_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1626_keep_dims_0 = const()[name = string("op_1626_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1626_cast_fp16 = reduce_mean(axes = var_1626_axes_0, keep_dims = var_1626_keep_dims_0, x = var_1624_cast_fp16)[name = string("op_1626_cast_fp16")];
+            fp16 var_1627_to_fp16 = const()[name = string("op_1627_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_63_cast_fp16 = add(x = var_1626_cast_fp16, y = var_1627_to_fp16)[name = string("mean_squared_63_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1629_cast_fp16 = pow(x = mean_squared_63_cast_fp16, y = var_27_to_fp16)[name = string("op_1629_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_117_cast_fp16 = mul(x = hidden_states_255_cast_fp16, y = var_1629_cast_fp16)[name = string("normed_output_117_cast_fp16")];
+            tensor<int32, [4]> hidden_states_261_perm_0 = const()[name = string("hidden_states_261_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_4_transpose_y_0 = const()[name = string("matmul_4_transpose_y_0"), val = bool(true)];
+            bool matmul_4_transpose_x_0 = const()[name = string("matmul_4_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_72_perm_0 = const()[name = string("transpose_72_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_73_perm_0 = const()[name = string("transpose_73_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_73 = transpose(perm = transpose_73_perm_0, x = key_states_9_cast_fp16)[name = string("transpose_141")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_72 = transpose(perm = transpose_72_perm_0, x = query_states_9_cast_fp16)[name = string("transpose_142")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_4_cast_fp16 = matmul(transpose_x = matmul_4_transpose_x_0, transpose_y = matmul_4_transpose_y_0, x = transpose_72, y = transpose_73)[name = string("matmul_4_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_4_cast_fp16 = add(x = matmul_4_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_4_cast_fp16")];
+            int32 softmax_4_axis_0 = const()[name = string("softmax_4_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_4_cast_fp16 = softmax(axis = softmax_4_axis_0, x = add_4_cast_fp16)[name = string("softmax_4_cast_fp16")];
+            bool attn_output_17_transpose_x_0 = const()[name = string("attn_output_17_transpose_x_0"), val = bool(false)];
+            bool attn_output_17_transpose_y_0 = const()[name = string("attn_output_17_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_261_cast_fp16 = transpose(perm = hidden_states_261_perm_0, x = normed_output_117_cast_fp16)[name = string("transpose_143")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_17_cast_fp16 = matmul(transpose_x = attn_output_17_transpose_x_0, transpose_y = attn_output_17_transpose_y_0, x = softmax_4_cast_fp16, y = hidden_states_261_cast_fp16)[name = string("attn_output_17_cast_fp16")];
+            tensor<int32, [4]> var_1634_perm_0 = const()[name = string("op_1634_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1636 = const()[name = string("op_1636"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_1634_cast_fp16 = transpose(perm = var_1634_perm_0, x = attn_output_17_cast_fp16)[name = string("transpose_140")];
+            tensor<fp16, [1, 2304, 768]> var_1637_cast_fp16 = reshape(shape = var_1636, x = var_1634_cast_fp16)[name = string("op_1637_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.32p+1)];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.3p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_62_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_self_attn_o_proj_input_max_promoted_to_fp16, x = var_1637_cast_fp16)[name = string("clip_62_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_4_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(115251712)))];
+            tensor<fp16, [1, 2304, 768]> linear_32_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_4_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_62_cast_fp16)[name = string("linear_32_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.72p+2)];
+            fp16 model_vision_tower_encoder_layers_4_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.6ep+2)];
+            tensor<fp16, [1, 2304, 768]> clip_63_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_32_cast_fp16)[name = string("clip_63_cast_fp16")];
+            fp16 var_33_promoted_32_to_fp16 = const()[name = string("op_33_promoted_32_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1650_cast_fp16 = pow(x = clip_63_cast_fp16, y = var_33_promoted_32_to_fp16)[name = string("op_1650_cast_fp16")];
+            tensor<int32, [1]> var_1652_axes_0 = const()[name = string("op_1652_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1652_keep_dims_0 = const()[name = string("op_1652_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1652_cast_fp16 = reduce_mean(axes = var_1652_axes_0, keep_dims = var_1652_keep_dims_0, x = var_1650_cast_fp16)[name = string("op_1652_cast_fp16")];
+            fp16 var_1653_to_fp16 = const()[name = string("op_1653_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_65_cast_fp16 = add(x = var_1652_cast_fp16, y = var_1653_to_fp16)[name = string("mean_squared_65_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1655_cast_fp16 = pow(x = mean_squared_65_cast_fp16, y = var_27_to_fp16)[name = string("op_1655_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_119_cast_fp16 = mul(x = clip_63_cast_fp16, y = var_1655_cast_fp16)[name = string("normed_output_119_cast_fp16")];
+            tensor<fp16, [768]> const_123_to_fp16 = const()[name = string("const_123_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116431424)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_121_cast_fp16 = mul(x = normed_output_119_cast_fp16, y = const_123_to_fp16)[name = string("normed_output_121_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_273_cast_fp16 = add(x = hidden_states_235_cast_fp16, y = normed_output_121_cast_fp16)[name = string("hidden_states_273_cast_fp16")];
+            fp16 var_33_promoted_33_to_fp16 = const()[name = string("op_33_promoted_33_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1663_cast_fp16 = pow(x = hidden_states_273_cast_fp16, y = var_33_promoted_33_to_fp16)[name = string("op_1663_cast_fp16")];
+            tensor<int32, [1]> var_1665_axes_0 = const()[name = string("op_1665_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1665_keep_dims_0 = const()[name = string("op_1665_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1665_cast_fp16 = reduce_mean(axes = var_1665_axes_0, keep_dims = var_1665_keep_dims_0, x = var_1663_cast_fp16)[name = string("op_1665_cast_fp16")];
+            fp16 var_1666_to_fp16 = const()[name = string("op_1666_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_67_cast_fp16 = add(x = var_1665_cast_fp16, y = var_1666_to_fp16)[name = string("mean_squared_67_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1668_cast_fp16 = pow(x = mean_squared_67_cast_fp16, y = var_27_to_fp16)[name = string("op_1668_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_123_cast_fp16 = mul(x = hidden_states_273_cast_fp16, y = var_1668_cast_fp16)[name = string("normed_output_123_cast_fp16")];
+            tensor<fp16, [768]> const_124_to_fp16 = const()[name = string("const_124_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116433024)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_125_cast_fp16 = mul(x = normed_output_123_cast_fp16, y = const_124_to_fp16)[name = string("normed_output_125_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.9ap+3)];
+            fp16 model_vision_tower_encoder_layers_4_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.96p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_64_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_125_cast_fp16)[name = string("clip_64_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_4_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(116434624)))];
+            tensor<fp16, [1, 2304, 3072]> linear_33_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_4_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_64_cast_fp16)[name = string("linear_33_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.16p+4)];
+            fp16 model_vision_tower_encoder_layers_4_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.14p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_65_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_33_cast_fp16)[name = string("clip_65_cast_fp16")];
+            string var_1685_mode_0 = const()[name = string("op_1685_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_1685_cast_fp16 = gelu(mode = var_1685_mode_0, x = clip_65_cast_fp16)[name = string("op_1685_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_4_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(121153280)))];
+            tensor<fp16, [1, 2304, 3072]> linear_34_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_4_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_64_cast_fp16)[name = string("linear_34_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.16p+4)];
+            fp16 model_vision_tower_encoder_layers_4_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.14p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_67_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_mlp_up_proj_output_max_promoted_to_fp16, x = linear_34_cast_fp16)[name = string("clip_67_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_283_cast_fp16 = mul(x = var_1685_cast_fp16, y = clip_67_cast_fp16)[name = string("hidden_states_283_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.6cp+6)];
+            fp16 model_vision_tower_encoder_layers_4_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.68p+6)];
+            tensor<fp16, [1, 2304, 3072]> clip_68_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_283_cast_fp16)[name = string("clip_68_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_4_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(125871936)))];
+            tensor<fp16, [1, 2304, 768]> linear_35_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_4_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_68_cast_fp16)[name = string("linear_35_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_4_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.28p+6)];
+            fp16 model_vision_tower_encoder_layers_4_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_4_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.26p+6)];
+            tensor<fp16, [1, 2304, 768]> clip_69_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_4_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_4_mlp_down_proj_output_max_promoted_to_fp16, x = linear_35_cast_fp16)[name = string("clip_69_cast_fp16")];
+            fp16 var_33_promoted_34_to_fp16 = const()[name = string("op_33_promoted_34_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1707_cast_fp16 = pow(x = clip_69_cast_fp16, y = var_33_promoted_34_to_fp16)[name = string("op_1707_cast_fp16")];
+            tensor<int32, [1]> var_1709_axes_0 = const()[name = string("op_1709_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1709_keep_dims_0 = const()[name = string("op_1709_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1709_cast_fp16 = reduce_mean(axes = var_1709_axes_0, keep_dims = var_1709_keep_dims_0, x = var_1707_cast_fp16)[name = string("op_1709_cast_fp16")];
+            fp16 var_1710_to_fp16 = const()[name = string("op_1710_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_69_cast_fp16 = add(x = var_1709_cast_fp16, y = var_1710_to_fp16)[name = string("mean_squared_69_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1712_cast_fp16 = pow(x = mean_squared_69_cast_fp16, y = var_27_to_fp16)[name = string("op_1712_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_127_cast_fp16 = mul(x = clip_69_cast_fp16, y = var_1712_cast_fp16)[name = string("normed_output_127_cast_fp16")];
+            tensor<fp16, [768]> const_125_to_fp16 = const()[name = string("const_125_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130590592)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_129_cast_fp16 = mul(x = normed_output_127_cast_fp16, y = const_125_to_fp16)[name = string("normed_output_129_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_293_cast_fp16 = add(x = hidden_states_273_cast_fp16, y = normed_output_129_cast_fp16)[name = string("hidden_states_293_cast_fp16")];
+            fp16 var_33_promoted_35_to_fp16 = const()[name = string("op_33_promoted_35_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1726_cast_fp16 = pow(x = hidden_states_293_cast_fp16, y = var_33_promoted_35_to_fp16)[name = string("op_1726_cast_fp16")];
+            tensor<int32, [1]> var_1728_axes_0 = const()[name = string("op_1728_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1728_keep_dims_0 = const()[name = string("op_1728_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1728_cast_fp16 = reduce_mean(axes = var_1728_axes_0, keep_dims = var_1728_keep_dims_0, x = var_1726_cast_fp16)[name = string("op_1728_cast_fp16")];
+            fp16 var_1729_to_fp16 = const()[name = string("op_1729_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_71_cast_fp16 = add(x = var_1728_cast_fp16, y = var_1729_to_fp16)[name = string("mean_squared_71_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1731_cast_fp16 = pow(x = mean_squared_71_cast_fp16, y = var_27_to_fp16)[name = string("op_1731_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_131_cast_fp16 = mul(x = hidden_states_293_cast_fp16, y = var_1731_cast_fp16)[name = string("normed_output_131_cast_fp16")];
+            tensor<fp16, [768]> const_126_to_fp16 = const()[name = string("const_126_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130592192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_133_cast_fp16 = mul(x = normed_output_131_cast_fp16, y = const_126_to_fp16)[name = string("normed_output_133_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.84p+3)];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.82p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_70_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_133_cast_fp16)[name = string("clip_70_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_5_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(130593792)))];
+            tensor<fp16, [1, 2304, 768]> linear_36_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_5_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_70_cast_fp16)[name = string("linear_36_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.fcp+3)];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.f8p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_71_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_36_cast_fp16)[name = string("clip_71_cast_fp16")];
+            tensor<int32, [4]> var_1753 = const()[name = string("op_1753"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_301_cast_fp16 = reshape(shape = var_1753, x = clip_71_cast_fp16)[name = string("hidden_states_301_cast_fp16")];
+            fp16 var_33_promoted_36_to_fp16 = const()[name = string("op_33_promoted_36_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1757_cast_fp16 = pow(x = hidden_states_301_cast_fp16, y = var_33_promoted_36_to_fp16)[name = string("op_1757_cast_fp16")];
+            tensor<int32, [1]> var_1759_axes_0 = const()[name = string("op_1759_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1759_keep_dims_0 = const()[name = string("op_1759_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1759_cast_fp16 = reduce_mean(axes = var_1759_axes_0, keep_dims = var_1759_keep_dims_0, x = var_1757_cast_fp16)[name = string("op_1759_cast_fp16")];
+            fp16 var_1760_to_fp16 = const()[name = string("op_1760_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_73_cast_fp16 = add(x = var_1759_cast_fp16, y = var_1760_to_fp16)[name = string("mean_squared_73_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1762_cast_fp16 = pow(x = mean_squared_73_cast_fp16, y = var_27_to_fp16)[name = string("op_1762_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_135_cast_fp16 = mul(x = hidden_states_301_cast_fp16, y = var_1762_cast_fp16)[name = string("normed_output_135_cast_fp16")];
+            tensor<fp16, [64]> const_129_to_fp16 = const()[name = string("const_129_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131773504)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_137_cast_fp16 = mul(x = normed_output_135_cast_fp16, y = const_129_to_fp16)[name = string("normed_output_137_cast_fp16")];
+            tensor<int32, [2]> var_1782 = const()[name = string("op_1782"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1783_axis_0 = const()[name = string("op_1783_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1783_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_1783_cast_fp16_1 = split(axis = var_1783_axis_0, split_sizes = var_1782, x = normed_output_137_cast_fp16)[name = string("op_1783_cast_fp16")];
+            tensor<int32, [2]> var_1786 = const()[name = string("op_1786"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1787_axis_0 = const()[name = string("op_1787_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1787_0, tensor<fp16, [1, 2304, 32]> var_1787_1 = split(axis = var_1787_axis_0, split_sizes = var_1786, x = var_160_cast_fp16)[name = string("op_1787")];
+            tensor<int32, [2]> var_1790 = const()[name = string("op_1790"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1791_axis_0 = const()[name = string("op_1791_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1791_0, tensor<fp16, [1, 2304, 32]> var_1791_1 = split(axis = var_1791_axis_0, split_sizes = var_1790, x = var_163_cast_fp16)[name = string("op_1791")];
+            tensor<int32, [1]> cos_85_axes_0 = const()[name = string("cos_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_85 = expand_dims(axes = cos_85_axes_0, x = var_1787_0)[name = string("cos_85")];
+            tensor<int32, [1]> sin_85_axes_0 = const()[name = string("sin_85_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_85 = expand_dims(axes = sin_85_axes_0, x = var_1791_0)[name = string("sin_85")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1796_cast_fp16 = mul(x = var_1783_cast_fp16_0, y = cos_85)[name = string("op_1796_cast_fp16")];
+            tensor<int32, [4]> x1_41_begin_0 = const()[name = string("x1_41_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_41_end_0 = const()[name = string("x1_41_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_41_end_mask_0 = const()[name = string("x1_41_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_41_cast_fp16 = slice_by_index(begin = x1_41_begin_0, end = x1_41_end_0, end_mask = x1_41_end_mask_0, x = var_1783_cast_fp16_0)[name = string("x1_41_cast_fp16")];
+            tensor<int32, [4]> x2_41_begin_0 = const()[name = string("x2_41_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_41_end_0 = const()[name = string("x2_41_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_41_end_mask_0 = const()[name = string("x2_41_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_41_cast_fp16 = slice_by_index(begin = x2_41_begin_0, end = x2_41_end_0, end_mask = x2_41_end_mask_0, x = var_1783_cast_fp16_0)[name = string("x2_41_cast_fp16")];
+            fp16 const_134_promoted_to_fp16 = const()[name = string("const_134_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1807_cast_fp16 = mul(x = x2_41_cast_fp16, y = const_134_promoted_to_fp16)[name = string("op_1807_cast_fp16")];
+            bool var_1809_interleave_0 = const()[name = string("op_1809_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1809_cast_fp16 = concat(axis = var_38, interleave = var_1809_interleave_0, values = (var_1807_cast_fp16, x1_41_cast_fp16))[name = string("op_1809_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1810_cast_fp16 = mul(x = var_1809_cast_fp16, y = sin_85)[name = string("op_1810_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1811_cast_fp16 = add(x = var_1796_cast_fp16, y = var_1810_cast_fp16)[name = string("op_1811_cast_fp16")];
+            tensor<int32, [1]> cos_89_axes_0 = const()[name = string("cos_89_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_89 = expand_dims(axes = cos_89_axes_0, x = var_1787_1)[name = string("cos_89")];
+            tensor<int32, [1]> sin_89_axes_0 = const()[name = string("sin_89_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_89 = expand_dims(axes = sin_89_axes_0, x = var_1791_1)[name = string("sin_89")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1814_cast_fp16 = mul(x = var_1783_cast_fp16_1, y = cos_89)[name = string("op_1814_cast_fp16")];
+            tensor<int32, [4]> x1_43_begin_0 = const()[name = string("x1_43_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_43_end_0 = const()[name = string("x1_43_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_43_end_mask_0 = const()[name = string("x1_43_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_43_cast_fp16 = slice_by_index(begin = x1_43_begin_0, end = x1_43_end_0, end_mask = x1_43_end_mask_0, x = var_1783_cast_fp16_1)[name = string("x1_43_cast_fp16")];
+            tensor<int32, [4]> x2_43_begin_0 = const()[name = string("x2_43_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_43_end_0 = const()[name = string("x2_43_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_43_end_mask_0 = const()[name = string("x2_43_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_43_cast_fp16 = slice_by_index(begin = x2_43_begin_0, end = x2_43_end_0, end_mask = x2_43_end_mask_0, x = var_1783_cast_fp16_1)[name = string("x2_43_cast_fp16")];
+            fp16 const_137_promoted_to_fp16 = const()[name = string("const_137_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1825_cast_fp16 = mul(x = x2_43_cast_fp16, y = const_137_promoted_to_fp16)[name = string("op_1825_cast_fp16")];
+            bool var_1827_interleave_0 = const()[name = string("op_1827_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1827_cast_fp16 = concat(axis = var_38, interleave = var_1827_interleave_0, values = (var_1825_cast_fp16, x1_43_cast_fp16))[name = string("op_1827_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1828_cast_fp16 = mul(x = var_1827_cast_fp16, y = sin_89)[name = string("op_1828_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1829_cast_fp16 = add(x = var_1814_cast_fp16, y = var_1828_cast_fp16)[name = string("op_1829_cast_fp16")];
+            bool query_states_11_interleave_0 = const()[name = string("query_states_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_11_cast_fp16 = concat(axis = var_38, interleave = query_states_11_interleave_0, values = (var_1811_cast_fp16, var_1829_cast_fp16))[name = string("query_states_11_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_5_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(131773696)))];
+            tensor<fp16, [1, 2304, 768]> linear_37_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_5_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_70_cast_fp16)[name = string("linear_37_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.36p+4)];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.34p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_73_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_37_cast_fp16)[name = string("clip_73_cast_fp16")];
+            tensor<int32, [4]> var_1842 = const()[name = string("op_1842"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_307_cast_fp16 = reshape(shape = var_1842, x = clip_73_cast_fp16)[name = string("hidden_states_307_cast_fp16")];
+            fp16 var_33_promoted_37_to_fp16 = const()[name = string("op_33_promoted_37_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1846_cast_fp16 = pow(x = hidden_states_307_cast_fp16, y = var_33_promoted_37_to_fp16)[name = string("op_1846_cast_fp16")];
+            tensor<int32, [1]> var_1848_axes_0 = const()[name = string("op_1848_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1848_keep_dims_0 = const()[name = string("op_1848_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1848_cast_fp16 = reduce_mean(axes = var_1848_axes_0, keep_dims = var_1848_keep_dims_0, x = var_1846_cast_fp16)[name = string("op_1848_cast_fp16")];
+            fp16 var_1849_to_fp16 = const()[name = string("op_1849_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_75_cast_fp16 = add(x = var_1848_cast_fp16, y = var_1849_to_fp16)[name = string("mean_squared_75_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1851_cast_fp16 = pow(x = mean_squared_75_cast_fp16, y = var_27_to_fp16)[name = string("op_1851_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_139_cast_fp16 = mul(x = hidden_states_307_cast_fp16, y = var_1851_cast_fp16)[name = string("normed_output_139_cast_fp16")];
+            tensor<fp16, [64]> const_138_to_fp16 = const()[name = string("const_138_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132953408)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_141_cast_fp16 = mul(x = normed_output_139_cast_fp16, y = const_138_to_fp16)[name = string("normed_output_141_cast_fp16")];
+            tensor<int32, [2]> var_1871 = const()[name = string("op_1871"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1872_axis_0 = const()[name = string("op_1872_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1872_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_1872_cast_fp16_1 = split(axis = var_1872_axis_0, split_sizes = var_1871, x = normed_output_141_cast_fp16)[name = string("op_1872_cast_fp16")];
+            tensor<int32, [2]> var_1875 = const()[name = string("op_1875"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1876_axis_0 = const()[name = string("op_1876_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1876_0, tensor<fp16, [1, 2304, 32]> var_1876_1 = split(axis = var_1876_axis_0, split_sizes = var_1875, x = var_160_cast_fp16)[name = string("op_1876")];
+            tensor<int32, [2]> var_1879 = const()[name = string("op_1879"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_1880_axis_0 = const()[name = string("op_1880_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_1880_0, tensor<fp16, [1, 2304, 32]> var_1880_1 = split(axis = var_1880_axis_0, split_sizes = var_1879, x = var_163_cast_fp16)[name = string("op_1880")];
+            tensor<int32, [1]> cos_93_axes_0 = const()[name = string("cos_93_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_93 = expand_dims(axes = cos_93_axes_0, x = var_1876_0)[name = string("cos_93")];
+            tensor<int32, [1]> sin_93_axes_0 = const()[name = string("sin_93_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_93 = expand_dims(axes = sin_93_axes_0, x = var_1880_0)[name = string("sin_93")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1885_cast_fp16 = mul(x = var_1872_cast_fp16_0, y = cos_93)[name = string("op_1885_cast_fp16")];
+            tensor<int32, [4]> x1_45_begin_0 = const()[name = string("x1_45_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_45_end_0 = const()[name = string("x1_45_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_45_end_mask_0 = const()[name = string("x1_45_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_45_cast_fp16 = slice_by_index(begin = x1_45_begin_0, end = x1_45_end_0, end_mask = x1_45_end_mask_0, x = var_1872_cast_fp16_0)[name = string("x1_45_cast_fp16")];
+            tensor<int32, [4]> x2_45_begin_0 = const()[name = string("x2_45_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_45_end_0 = const()[name = string("x2_45_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_45_end_mask_0 = const()[name = string("x2_45_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_45_cast_fp16 = slice_by_index(begin = x2_45_begin_0, end = x2_45_end_0, end_mask = x2_45_end_mask_0, x = var_1872_cast_fp16_0)[name = string("x2_45_cast_fp16")];
+            fp16 const_143_promoted_to_fp16 = const()[name = string("const_143_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1896_cast_fp16 = mul(x = x2_45_cast_fp16, y = const_143_promoted_to_fp16)[name = string("op_1896_cast_fp16")];
+            bool var_1898_interleave_0 = const()[name = string("op_1898_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1898_cast_fp16 = concat(axis = var_38, interleave = var_1898_interleave_0, values = (var_1896_cast_fp16, x1_45_cast_fp16))[name = string("op_1898_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1899_cast_fp16 = mul(x = var_1898_cast_fp16, y = sin_93)[name = string("op_1899_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1900_cast_fp16 = add(x = var_1885_cast_fp16, y = var_1899_cast_fp16)[name = string("op_1900_cast_fp16")];
+            tensor<int32, [1]> cos_97_axes_0 = const()[name = string("cos_97_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_97 = expand_dims(axes = cos_97_axes_0, x = var_1876_1)[name = string("cos_97")];
+            tensor<int32, [1]> sin_97_axes_0 = const()[name = string("sin_97_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_97 = expand_dims(axes = sin_97_axes_0, x = var_1880_1)[name = string("sin_97")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1903_cast_fp16 = mul(x = var_1872_cast_fp16_1, y = cos_97)[name = string("op_1903_cast_fp16")];
+            tensor<int32, [4]> x1_47_begin_0 = const()[name = string("x1_47_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_47_end_0 = const()[name = string("x1_47_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_47_end_mask_0 = const()[name = string("x1_47_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_47_cast_fp16 = slice_by_index(begin = x1_47_begin_0, end = x1_47_end_0, end_mask = x1_47_end_mask_0, x = var_1872_cast_fp16_1)[name = string("x1_47_cast_fp16")];
+            tensor<int32, [4]> x2_47_begin_0 = const()[name = string("x2_47_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_47_end_0 = const()[name = string("x2_47_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_47_end_mask_0 = const()[name = string("x2_47_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_47_cast_fp16 = slice_by_index(begin = x2_47_begin_0, end = x2_47_end_0, end_mask = x2_47_end_mask_0, x = var_1872_cast_fp16_1)[name = string("x2_47_cast_fp16")];
+            fp16 const_146_promoted_to_fp16 = const()[name = string("const_146_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_1914_cast_fp16 = mul(x = x2_47_cast_fp16, y = const_146_promoted_to_fp16)[name = string("op_1914_cast_fp16")];
+            bool var_1916_interleave_0 = const()[name = string("op_1916_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_1916_cast_fp16 = concat(axis = var_38, interleave = var_1916_interleave_0, values = (var_1914_cast_fp16, x1_47_cast_fp16))[name = string("op_1916_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1917_cast_fp16 = mul(x = var_1916_cast_fp16, y = sin_97)[name = string("op_1917_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_1918_cast_fp16 = add(x = var_1903_cast_fp16, y = var_1917_cast_fp16)[name = string("op_1918_cast_fp16")];
+            bool key_states_11_interleave_0 = const()[name = string("key_states_11_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_11_cast_fp16 = concat(axis = var_38, interleave = key_states_11_interleave_0, values = (var_1900_cast_fp16, var_1918_cast_fp16))[name = string("key_states_11_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_5_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(132953600)))];
+            tensor<fp16, [1, 2304, 768]> linear_38_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_5_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_70_cast_fp16)[name = string("linear_38_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.36p+4)];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.34p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_75_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_38_cast_fp16)[name = string("clip_75_cast_fp16")];
+            tensor<int32, [4]> var_1931 = const()[name = string("op_1931"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_313_cast_fp16 = reshape(shape = var_1931, x = clip_75_cast_fp16)[name = string("hidden_states_313_cast_fp16")];
+            fp16 var_33_promoted_38_to_fp16 = const()[name = string("op_33_promoted_38_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_1934_cast_fp16 = pow(x = hidden_states_313_cast_fp16, y = var_33_promoted_38_to_fp16)[name = string("op_1934_cast_fp16")];
+            tensor<int32, [1]> var_1936_axes_0 = const()[name = string("op_1936_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1936_keep_dims_0 = const()[name = string("op_1936_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_1936_cast_fp16 = reduce_mean(axes = var_1936_axes_0, keep_dims = var_1936_keep_dims_0, x = var_1934_cast_fp16)[name = string("op_1936_cast_fp16")];
+            fp16 var_1937_to_fp16 = const()[name = string("op_1937_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_77_cast_fp16 = add(x = var_1936_cast_fp16, y = var_1937_to_fp16)[name = string("mean_squared_77_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_1939_cast_fp16 = pow(x = mean_squared_77_cast_fp16, y = var_27_to_fp16)[name = string("op_1939_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_143_cast_fp16 = mul(x = hidden_states_313_cast_fp16, y = var_1939_cast_fp16)[name = string("normed_output_143_cast_fp16")];
+            tensor<int32, [4]> hidden_states_319_perm_0 = const()[name = string("hidden_states_319_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_5_transpose_y_0 = const()[name = string("matmul_5_transpose_y_0"), val = bool(true)];
+            bool matmul_5_transpose_x_0 = const()[name = string("matmul_5_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_74_perm_0 = const()[name = string("transpose_74_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_75_perm_0 = const()[name = string("transpose_75_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_75 = transpose(perm = transpose_75_perm_0, x = key_states_11_cast_fp16)[name = string("transpose_137")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_74 = transpose(perm = transpose_74_perm_0, x = query_states_11_cast_fp16)[name = string("transpose_138")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_5_cast_fp16 = matmul(transpose_x = matmul_5_transpose_x_0, transpose_y = matmul_5_transpose_y_0, x = transpose_74, y = transpose_75)[name = string("matmul_5_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_5_cast_fp16 = add(x = matmul_5_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_5_cast_fp16")];
+            int32 softmax_5_axis_0 = const()[name = string("softmax_5_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_5_cast_fp16 = softmax(axis = softmax_5_axis_0, x = add_5_cast_fp16)[name = string("softmax_5_cast_fp16")];
+            bool attn_output_21_transpose_x_0 = const()[name = string("attn_output_21_transpose_x_0"), val = bool(false)];
+            bool attn_output_21_transpose_y_0 = const()[name = string("attn_output_21_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_319_cast_fp16 = transpose(perm = hidden_states_319_perm_0, x = normed_output_143_cast_fp16)[name = string("transpose_139")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_21_cast_fp16 = matmul(transpose_x = attn_output_21_transpose_x_0, transpose_y = attn_output_21_transpose_y_0, x = softmax_5_cast_fp16, y = hidden_states_319_cast_fp16)[name = string("attn_output_21_cast_fp16")];
+            tensor<int32, [4]> var_1944_perm_0 = const()[name = string("op_1944_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_1946 = const()[name = string("op_1946"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_1944_cast_fp16 = transpose(perm = var_1944_perm_0, x = attn_output_21_cast_fp16)[name = string("transpose_136")];
+            tensor<fp16, [1, 2304, 768]> var_1947_cast_fp16 = reshape(shape = var_1946, x = var_1944_cast_fp16)[name = string("op_1947_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.6cp+1)];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.6ap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_76_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_self_attn_o_proj_input_max_promoted_to_fp16, x = var_1947_cast_fp16)[name = string("clip_76_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_5_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(134133312)))];
+            tensor<fp16, [1, 2304, 768]> linear_39_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_5_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_76_cast_fp16)[name = string("linear_39_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.32p+2)];
+            fp16 model_vision_tower_encoder_layers_5_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_77_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_39_cast_fp16)[name = string("clip_77_cast_fp16")];
+            fp16 var_33_promoted_39_to_fp16 = const()[name = string("op_33_promoted_39_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1960_cast_fp16 = pow(x = clip_77_cast_fp16, y = var_33_promoted_39_to_fp16)[name = string("op_1960_cast_fp16")];
+            tensor<int32, [1]> var_1962_axes_0 = const()[name = string("op_1962_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1962_keep_dims_0 = const()[name = string("op_1962_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1962_cast_fp16 = reduce_mean(axes = var_1962_axes_0, keep_dims = var_1962_keep_dims_0, x = var_1960_cast_fp16)[name = string("op_1962_cast_fp16")];
+            fp16 var_1963_to_fp16 = const()[name = string("op_1963_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_79_cast_fp16 = add(x = var_1962_cast_fp16, y = var_1963_to_fp16)[name = string("mean_squared_79_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1965_cast_fp16 = pow(x = mean_squared_79_cast_fp16, y = var_27_to_fp16)[name = string("op_1965_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_145_cast_fp16 = mul(x = clip_77_cast_fp16, y = var_1965_cast_fp16)[name = string("normed_output_145_cast_fp16")];
+            tensor<fp16, [768]> const_147_to_fp16 = const()[name = string("const_147_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(135313024)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_147_cast_fp16 = mul(x = normed_output_145_cast_fp16, y = const_147_to_fp16)[name = string("normed_output_147_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_331_cast_fp16 = add(x = hidden_states_293_cast_fp16, y = normed_output_147_cast_fp16)[name = string("hidden_states_331_cast_fp16")];
+            fp16 var_33_promoted_40_to_fp16 = const()[name = string("op_33_promoted_40_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_1973_cast_fp16 = pow(x = hidden_states_331_cast_fp16, y = var_33_promoted_40_to_fp16)[name = string("op_1973_cast_fp16")];
+            tensor<int32, [1]> var_1975_axes_0 = const()[name = string("op_1975_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_1975_keep_dims_0 = const()[name = string("op_1975_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_1975_cast_fp16 = reduce_mean(axes = var_1975_axes_0, keep_dims = var_1975_keep_dims_0, x = var_1973_cast_fp16)[name = string("op_1975_cast_fp16")];
+            fp16 var_1976_to_fp16 = const()[name = string("op_1976_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_81_cast_fp16 = add(x = var_1975_cast_fp16, y = var_1976_to_fp16)[name = string("mean_squared_81_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_1978_cast_fp16 = pow(x = mean_squared_81_cast_fp16, y = var_27_to_fp16)[name = string("op_1978_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_149_cast_fp16 = mul(x = hidden_states_331_cast_fp16, y = var_1978_cast_fp16)[name = string("normed_output_149_cast_fp16")];
+            tensor<fp16, [768]> const_148_to_fp16 = const()[name = string("const_148_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(135314624)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_151_cast_fp16 = mul(x = normed_output_149_cast_fp16, y = const_148_to_fp16)[name = string("normed_output_151_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.3cp+3)];
+            fp16 model_vision_tower_encoder_layers_5_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.3ap+3)];
+            tensor<fp16, [1, 2304, 768]> clip_78_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_151_cast_fp16)[name = string("clip_78_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_5_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(135316224)))];
+            tensor<fp16, [1, 2304, 3072]> linear_40_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_5_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_78_cast_fp16)[name = string("linear_40_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.84p+3)];
+            fp16 model_vision_tower_encoder_layers_5_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.82p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_79_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_40_cast_fp16)[name = string("clip_79_cast_fp16")];
+            string var_1995_mode_0 = const()[name = string("op_1995_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_1995_cast_fp16 = gelu(mode = var_1995_mode_0, x = clip_79_cast_fp16)[name = string("op_1995_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_5_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(140034880)))];
+            tensor<fp16, [1, 2304, 3072]> linear_41_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_5_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_78_cast_fp16)[name = string("linear_41_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.84p+3)];
+            fp16 model_vision_tower_encoder_layers_5_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.82p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_81_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_mlp_up_proj_output_max_promoted_to_fp16, x = linear_41_cast_fp16)[name = string("clip_81_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_341_cast_fp16 = mul(x = var_1995_cast_fp16, y = clip_81_cast_fp16)[name = string("hidden_states_341_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.82p+5)];
+            fp16 model_vision_tower_encoder_layers_5_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.7ep+5)];
+            tensor<fp16, [1, 2304, 3072]> clip_82_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_341_cast_fp16)[name = string("clip_82_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_5_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(144753536)))];
+            tensor<fp16, [1, 2304, 768]> linear_42_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_5_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_82_cast_fp16)[name = string("linear_42_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_5_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.f2p+4)];
+            fp16 model_vision_tower_encoder_layers_5_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_5_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.eep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_83_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_5_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_5_mlp_down_proj_output_max_promoted_to_fp16, x = linear_42_cast_fp16)[name = string("clip_83_cast_fp16")];
+            fp16 var_33_promoted_41_to_fp16 = const()[name = string("op_33_promoted_41_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2017_cast_fp16 = pow(x = clip_83_cast_fp16, y = var_33_promoted_41_to_fp16)[name = string("op_2017_cast_fp16")];
+            tensor<int32, [1]> var_2019_axes_0 = const()[name = string("op_2019_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2019_keep_dims_0 = const()[name = string("op_2019_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2019_cast_fp16 = reduce_mean(axes = var_2019_axes_0, keep_dims = var_2019_keep_dims_0, x = var_2017_cast_fp16)[name = string("op_2019_cast_fp16")];
+            fp16 var_2020_to_fp16 = const()[name = string("op_2020_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_83_cast_fp16 = add(x = var_2019_cast_fp16, y = var_2020_to_fp16)[name = string("mean_squared_83_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2022_cast_fp16 = pow(x = mean_squared_83_cast_fp16, y = var_27_to_fp16)[name = string("op_2022_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_153_cast_fp16 = mul(x = clip_83_cast_fp16, y = var_2022_cast_fp16)[name = string("normed_output_153_cast_fp16")];
+            tensor<fp16, [768]> const_149_to_fp16 = const()[name = string("const_149_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(149472192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_155_cast_fp16 = mul(x = normed_output_153_cast_fp16, y = const_149_to_fp16)[name = string("normed_output_155_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_351_cast_fp16 = add(x = hidden_states_331_cast_fp16, y = normed_output_155_cast_fp16)[name = string("hidden_states_351_cast_fp16")];
+            fp16 var_33_promoted_42_to_fp16 = const()[name = string("op_33_promoted_42_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2036_cast_fp16 = pow(x = hidden_states_351_cast_fp16, y = var_33_promoted_42_to_fp16)[name = string("op_2036_cast_fp16")];
+            tensor<int32, [1]> var_2038_axes_0 = const()[name = string("op_2038_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2038_keep_dims_0 = const()[name = string("op_2038_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2038_cast_fp16 = reduce_mean(axes = var_2038_axes_0, keep_dims = var_2038_keep_dims_0, x = var_2036_cast_fp16)[name = string("op_2038_cast_fp16")];
+            fp16 var_2039_to_fp16 = const()[name = string("op_2039_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_85_cast_fp16 = add(x = var_2038_cast_fp16, y = var_2039_to_fp16)[name = string("mean_squared_85_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2041_cast_fp16 = pow(x = mean_squared_85_cast_fp16, y = var_27_to_fp16)[name = string("op_2041_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_157_cast_fp16 = mul(x = hidden_states_351_cast_fp16, y = var_2041_cast_fp16)[name = string("normed_output_157_cast_fp16")];
+            tensor<fp16, [768]> const_150_to_fp16 = const()[name = string("const_150_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(149473792)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_159_cast_fp16 = mul(x = normed_output_157_cast_fp16, y = const_150_to_fp16)[name = string("normed_output_159_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.42p+3)];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.4p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_84_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_159_cast_fp16)[name = string("clip_84_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_6_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(149475392)))];
+            tensor<fp16, [1, 2304, 768]> linear_43_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_6_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_84_cast_fp16)[name = string("linear_43_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.ccp+3)];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.cap+3)];
+            tensor<fp16, [1, 2304, 768]> clip_85_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_43_cast_fp16)[name = string("clip_85_cast_fp16")];
+            tensor<int32, [4]> var_2063 = const()[name = string("op_2063"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_359_cast_fp16 = reshape(shape = var_2063, x = clip_85_cast_fp16)[name = string("hidden_states_359_cast_fp16")];
+            fp16 var_33_promoted_43_to_fp16 = const()[name = string("op_33_promoted_43_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2067_cast_fp16 = pow(x = hidden_states_359_cast_fp16, y = var_33_promoted_43_to_fp16)[name = string("op_2067_cast_fp16")];
+            tensor<int32, [1]> var_2069_axes_0 = const()[name = string("op_2069_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2069_keep_dims_0 = const()[name = string("op_2069_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2069_cast_fp16 = reduce_mean(axes = var_2069_axes_0, keep_dims = var_2069_keep_dims_0, x = var_2067_cast_fp16)[name = string("op_2069_cast_fp16")];
+            fp16 var_2070_to_fp16 = const()[name = string("op_2070_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_87_cast_fp16 = add(x = var_2069_cast_fp16, y = var_2070_to_fp16)[name = string("mean_squared_87_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2072_cast_fp16 = pow(x = mean_squared_87_cast_fp16, y = var_27_to_fp16)[name = string("op_2072_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_161_cast_fp16 = mul(x = hidden_states_359_cast_fp16, y = var_2072_cast_fp16)[name = string("normed_output_161_cast_fp16")];
+            tensor<fp16, [64]> const_153_to_fp16 = const()[name = string("const_153_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(150655104)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_163_cast_fp16 = mul(x = normed_output_161_cast_fp16, y = const_153_to_fp16)[name = string("normed_output_163_cast_fp16")];
+            tensor<int32, [2]> var_2092 = const()[name = string("op_2092"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2093_axis_0 = const()[name = string("op_2093_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2093_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_2093_cast_fp16_1 = split(axis = var_2093_axis_0, split_sizes = var_2092, x = normed_output_163_cast_fp16)[name = string("op_2093_cast_fp16")];
+            tensor<int32, [2]> var_2096 = const()[name = string("op_2096"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2097_axis_0 = const()[name = string("op_2097_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2097_0, tensor<fp16, [1, 2304, 32]> var_2097_1 = split(axis = var_2097_axis_0, split_sizes = var_2096, x = var_160_cast_fp16)[name = string("op_2097")];
+            tensor<int32, [2]> var_2100 = const()[name = string("op_2100"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2101_axis_0 = const()[name = string("op_2101_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2101_0, tensor<fp16, [1, 2304, 32]> var_2101_1 = split(axis = var_2101_axis_0, split_sizes = var_2100, x = var_163_cast_fp16)[name = string("op_2101")];
+            tensor<int32, [1]> cos_101_axes_0 = const()[name = string("cos_101_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_101 = expand_dims(axes = cos_101_axes_0, x = var_2097_0)[name = string("cos_101")];
+            tensor<int32, [1]> sin_101_axes_0 = const()[name = string("sin_101_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_101 = expand_dims(axes = sin_101_axes_0, x = var_2101_0)[name = string("sin_101")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2106_cast_fp16 = mul(x = var_2093_cast_fp16_0, y = cos_101)[name = string("op_2106_cast_fp16")];
+            tensor<int32, [4]> x1_49_begin_0 = const()[name = string("x1_49_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_49_end_0 = const()[name = string("x1_49_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_49_end_mask_0 = const()[name = string("x1_49_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_49_cast_fp16 = slice_by_index(begin = x1_49_begin_0, end = x1_49_end_0, end_mask = x1_49_end_mask_0, x = var_2093_cast_fp16_0)[name = string("x1_49_cast_fp16")];
+            tensor<int32, [4]> x2_49_begin_0 = const()[name = string("x2_49_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_49_end_0 = const()[name = string("x2_49_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_49_end_mask_0 = const()[name = string("x2_49_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_49_cast_fp16 = slice_by_index(begin = x2_49_begin_0, end = x2_49_end_0, end_mask = x2_49_end_mask_0, x = var_2093_cast_fp16_0)[name = string("x2_49_cast_fp16")];
+            fp16 const_158_promoted_to_fp16 = const()[name = string("const_158_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2117_cast_fp16 = mul(x = x2_49_cast_fp16, y = const_158_promoted_to_fp16)[name = string("op_2117_cast_fp16")];
+            bool var_2119_interleave_0 = const()[name = string("op_2119_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2119_cast_fp16 = concat(axis = var_38, interleave = var_2119_interleave_0, values = (var_2117_cast_fp16, x1_49_cast_fp16))[name = string("op_2119_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2120_cast_fp16 = mul(x = var_2119_cast_fp16, y = sin_101)[name = string("op_2120_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2121_cast_fp16 = add(x = var_2106_cast_fp16, y = var_2120_cast_fp16)[name = string("op_2121_cast_fp16")];
+            tensor<int32, [1]> cos_105_axes_0 = const()[name = string("cos_105_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_105 = expand_dims(axes = cos_105_axes_0, x = var_2097_1)[name = string("cos_105")];
+            tensor<int32, [1]> sin_105_axes_0 = const()[name = string("sin_105_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_105 = expand_dims(axes = sin_105_axes_0, x = var_2101_1)[name = string("sin_105")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2124_cast_fp16 = mul(x = var_2093_cast_fp16_1, y = cos_105)[name = string("op_2124_cast_fp16")];
+            tensor<int32, [4]> x1_51_begin_0 = const()[name = string("x1_51_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_51_end_0 = const()[name = string("x1_51_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_51_end_mask_0 = const()[name = string("x1_51_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_51_cast_fp16 = slice_by_index(begin = x1_51_begin_0, end = x1_51_end_0, end_mask = x1_51_end_mask_0, x = var_2093_cast_fp16_1)[name = string("x1_51_cast_fp16")];
+            tensor<int32, [4]> x2_51_begin_0 = const()[name = string("x2_51_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_51_end_0 = const()[name = string("x2_51_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_51_end_mask_0 = const()[name = string("x2_51_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_51_cast_fp16 = slice_by_index(begin = x2_51_begin_0, end = x2_51_end_0, end_mask = x2_51_end_mask_0, x = var_2093_cast_fp16_1)[name = string("x2_51_cast_fp16")];
+            fp16 const_161_promoted_to_fp16 = const()[name = string("const_161_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2135_cast_fp16 = mul(x = x2_51_cast_fp16, y = const_161_promoted_to_fp16)[name = string("op_2135_cast_fp16")];
+            bool var_2137_interleave_0 = const()[name = string("op_2137_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2137_cast_fp16 = concat(axis = var_38, interleave = var_2137_interleave_0, values = (var_2135_cast_fp16, x1_51_cast_fp16))[name = string("op_2137_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2138_cast_fp16 = mul(x = var_2137_cast_fp16, y = sin_105)[name = string("op_2138_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2139_cast_fp16 = add(x = var_2124_cast_fp16, y = var_2138_cast_fp16)[name = string("op_2139_cast_fp16")];
+            bool query_states_13_interleave_0 = const()[name = string("query_states_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_13_cast_fp16 = concat(axis = var_38, interleave = query_states_13_interleave_0, values = (var_2121_cast_fp16, var_2139_cast_fp16))[name = string("query_states_13_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_6_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(150655296)))];
+            tensor<fp16, [1, 2304, 768]> linear_44_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_6_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_84_cast_fp16)[name = string("linear_44_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.26p+4)];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.24p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_87_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_44_cast_fp16)[name = string("clip_87_cast_fp16")];
+            tensor<int32, [4]> var_2152 = const()[name = string("op_2152"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_365_cast_fp16 = reshape(shape = var_2152, x = clip_87_cast_fp16)[name = string("hidden_states_365_cast_fp16")];
+            fp16 var_33_promoted_44_to_fp16 = const()[name = string("op_33_promoted_44_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2156_cast_fp16 = pow(x = hidden_states_365_cast_fp16, y = var_33_promoted_44_to_fp16)[name = string("op_2156_cast_fp16")];
+            tensor<int32, [1]> var_2158_axes_0 = const()[name = string("op_2158_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2158_keep_dims_0 = const()[name = string("op_2158_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2158_cast_fp16 = reduce_mean(axes = var_2158_axes_0, keep_dims = var_2158_keep_dims_0, x = var_2156_cast_fp16)[name = string("op_2158_cast_fp16")];
+            fp16 var_2159_to_fp16 = const()[name = string("op_2159_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_89_cast_fp16 = add(x = var_2158_cast_fp16, y = var_2159_to_fp16)[name = string("mean_squared_89_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2161_cast_fp16 = pow(x = mean_squared_89_cast_fp16, y = var_27_to_fp16)[name = string("op_2161_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_165_cast_fp16 = mul(x = hidden_states_365_cast_fp16, y = var_2161_cast_fp16)[name = string("normed_output_165_cast_fp16")];
+            tensor<fp16, [64]> const_162_to_fp16 = const()[name = string("const_162_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151835008)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_167_cast_fp16 = mul(x = normed_output_165_cast_fp16, y = const_162_to_fp16)[name = string("normed_output_167_cast_fp16")];
+            tensor<int32, [2]> var_2181 = const()[name = string("op_2181"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2182_axis_0 = const()[name = string("op_2182_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2182_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_2182_cast_fp16_1 = split(axis = var_2182_axis_0, split_sizes = var_2181, x = normed_output_167_cast_fp16)[name = string("op_2182_cast_fp16")];
+            tensor<int32, [2]> var_2185 = const()[name = string("op_2185"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2186_axis_0 = const()[name = string("op_2186_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2186_0, tensor<fp16, [1, 2304, 32]> var_2186_1 = split(axis = var_2186_axis_0, split_sizes = var_2185, x = var_160_cast_fp16)[name = string("op_2186")];
+            tensor<int32, [2]> var_2189 = const()[name = string("op_2189"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2190_axis_0 = const()[name = string("op_2190_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2190_0, tensor<fp16, [1, 2304, 32]> var_2190_1 = split(axis = var_2190_axis_0, split_sizes = var_2189, x = var_163_cast_fp16)[name = string("op_2190")];
+            tensor<int32, [1]> cos_109_axes_0 = const()[name = string("cos_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_109 = expand_dims(axes = cos_109_axes_0, x = var_2186_0)[name = string("cos_109")];
+            tensor<int32, [1]> sin_109_axes_0 = const()[name = string("sin_109_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_109 = expand_dims(axes = sin_109_axes_0, x = var_2190_0)[name = string("sin_109")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2195_cast_fp16 = mul(x = var_2182_cast_fp16_0, y = cos_109)[name = string("op_2195_cast_fp16")];
+            tensor<int32, [4]> x1_53_begin_0 = const()[name = string("x1_53_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_53_end_0 = const()[name = string("x1_53_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_53_end_mask_0 = const()[name = string("x1_53_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_53_cast_fp16 = slice_by_index(begin = x1_53_begin_0, end = x1_53_end_0, end_mask = x1_53_end_mask_0, x = var_2182_cast_fp16_0)[name = string("x1_53_cast_fp16")];
+            tensor<int32, [4]> x2_53_begin_0 = const()[name = string("x2_53_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_53_end_0 = const()[name = string("x2_53_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_53_end_mask_0 = const()[name = string("x2_53_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_53_cast_fp16 = slice_by_index(begin = x2_53_begin_0, end = x2_53_end_0, end_mask = x2_53_end_mask_0, x = var_2182_cast_fp16_0)[name = string("x2_53_cast_fp16")];
+            fp16 const_167_promoted_to_fp16 = const()[name = string("const_167_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2206_cast_fp16 = mul(x = x2_53_cast_fp16, y = const_167_promoted_to_fp16)[name = string("op_2206_cast_fp16")];
+            bool var_2208_interleave_0 = const()[name = string("op_2208_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2208_cast_fp16 = concat(axis = var_38, interleave = var_2208_interleave_0, values = (var_2206_cast_fp16, x1_53_cast_fp16))[name = string("op_2208_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2209_cast_fp16 = mul(x = var_2208_cast_fp16, y = sin_109)[name = string("op_2209_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2210_cast_fp16 = add(x = var_2195_cast_fp16, y = var_2209_cast_fp16)[name = string("op_2210_cast_fp16")];
+            tensor<int32, [1]> cos_113_axes_0 = const()[name = string("cos_113_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_113 = expand_dims(axes = cos_113_axes_0, x = var_2186_1)[name = string("cos_113")];
+            tensor<int32, [1]> sin_113_axes_0 = const()[name = string("sin_113_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_113 = expand_dims(axes = sin_113_axes_0, x = var_2190_1)[name = string("sin_113")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2213_cast_fp16 = mul(x = var_2182_cast_fp16_1, y = cos_113)[name = string("op_2213_cast_fp16")];
+            tensor<int32, [4]> x1_55_begin_0 = const()[name = string("x1_55_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_55_end_0 = const()[name = string("x1_55_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_55_end_mask_0 = const()[name = string("x1_55_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_55_cast_fp16 = slice_by_index(begin = x1_55_begin_0, end = x1_55_end_0, end_mask = x1_55_end_mask_0, x = var_2182_cast_fp16_1)[name = string("x1_55_cast_fp16")];
+            tensor<int32, [4]> x2_55_begin_0 = const()[name = string("x2_55_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_55_end_0 = const()[name = string("x2_55_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_55_end_mask_0 = const()[name = string("x2_55_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_55_cast_fp16 = slice_by_index(begin = x2_55_begin_0, end = x2_55_end_0, end_mask = x2_55_end_mask_0, x = var_2182_cast_fp16_1)[name = string("x2_55_cast_fp16")];
+            fp16 const_170_promoted_to_fp16 = const()[name = string("const_170_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2224_cast_fp16 = mul(x = x2_55_cast_fp16, y = const_170_promoted_to_fp16)[name = string("op_2224_cast_fp16")];
+            bool var_2226_interleave_0 = const()[name = string("op_2226_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2226_cast_fp16 = concat(axis = var_38, interleave = var_2226_interleave_0, values = (var_2224_cast_fp16, x1_55_cast_fp16))[name = string("op_2226_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2227_cast_fp16 = mul(x = var_2226_cast_fp16, y = sin_113)[name = string("op_2227_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2228_cast_fp16 = add(x = var_2213_cast_fp16, y = var_2227_cast_fp16)[name = string("op_2228_cast_fp16")];
+            bool key_states_13_interleave_0 = const()[name = string("key_states_13_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_13_cast_fp16 = concat(axis = var_38, interleave = key_states_13_interleave_0, values = (var_2210_cast_fp16, var_2228_cast_fp16))[name = string("key_states_13_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_6_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(151835200)))];
+            tensor<fp16, [1, 2304, 768]> linear_45_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_6_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_84_cast_fp16)[name = string("linear_45_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.26p+4)];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.24p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_89_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_45_cast_fp16)[name = string("clip_89_cast_fp16")];
+            tensor<int32, [4]> var_2241 = const()[name = string("op_2241"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_371_cast_fp16 = reshape(shape = var_2241, x = clip_89_cast_fp16)[name = string("hidden_states_371_cast_fp16")];
+            fp16 var_33_promoted_45_to_fp16 = const()[name = string("op_33_promoted_45_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2244_cast_fp16 = pow(x = hidden_states_371_cast_fp16, y = var_33_promoted_45_to_fp16)[name = string("op_2244_cast_fp16")];
+            tensor<int32, [1]> var_2246_axes_0 = const()[name = string("op_2246_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2246_keep_dims_0 = const()[name = string("op_2246_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2246_cast_fp16 = reduce_mean(axes = var_2246_axes_0, keep_dims = var_2246_keep_dims_0, x = var_2244_cast_fp16)[name = string("op_2246_cast_fp16")];
+            fp16 var_2247_to_fp16 = const()[name = string("op_2247_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_91_cast_fp16 = add(x = var_2246_cast_fp16, y = var_2247_to_fp16)[name = string("mean_squared_91_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2249_cast_fp16 = pow(x = mean_squared_91_cast_fp16, y = var_27_to_fp16)[name = string("op_2249_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_169_cast_fp16 = mul(x = hidden_states_371_cast_fp16, y = var_2249_cast_fp16)[name = string("normed_output_169_cast_fp16")];
+            tensor<int32, [4]> hidden_states_377_perm_0 = const()[name = string("hidden_states_377_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_6_transpose_y_0 = const()[name = string("matmul_6_transpose_y_0"), val = bool(true)];
+            bool matmul_6_transpose_x_0 = const()[name = string("matmul_6_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_76_perm_0 = const()[name = string("transpose_76_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_77_perm_0 = const()[name = string("transpose_77_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_77 = transpose(perm = transpose_77_perm_0, x = key_states_13_cast_fp16)[name = string("transpose_133")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_76 = transpose(perm = transpose_76_perm_0, x = query_states_13_cast_fp16)[name = string("transpose_134")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_6_cast_fp16 = matmul(transpose_x = matmul_6_transpose_x_0, transpose_y = matmul_6_transpose_y_0, x = transpose_76, y = transpose_77)[name = string("matmul_6_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_6_cast_fp16 = add(x = matmul_6_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_6_cast_fp16")];
+            int32 softmax_6_axis_0 = const()[name = string("softmax_6_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_6_cast_fp16 = softmax(axis = softmax_6_axis_0, x = add_6_cast_fp16)[name = string("softmax_6_cast_fp16")];
+            bool attn_output_25_transpose_x_0 = const()[name = string("attn_output_25_transpose_x_0"), val = bool(false)];
+            bool attn_output_25_transpose_y_0 = const()[name = string("attn_output_25_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_377_cast_fp16 = transpose(perm = hidden_states_377_perm_0, x = normed_output_169_cast_fp16)[name = string("transpose_135")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_25_cast_fp16 = matmul(transpose_x = attn_output_25_transpose_x_0, transpose_y = attn_output_25_transpose_y_0, x = softmax_6_cast_fp16, y = hidden_states_377_cast_fp16)[name = string("attn_output_25_cast_fp16")];
+            tensor<int32, [4]> var_2254_perm_0 = const()[name = string("op_2254_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2256 = const()[name = string("op_2256"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_2254_cast_fp16 = transpose(perm = var_2254_perm_0, x = attn_output_25_cast_fp16)[name = string("transpose_132")];
+            tensor<fp16, [1, 2304, 768]> var_2257_cast_fp16 = reshape(shape = var_2256, x = var_2254_cast_fp16)[name = string("op_2257_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.2ap+1)];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.28p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_90_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_self_attn_o_proj_input_max_promoted_to_fp16, x = var_2257_cast_fp16)[name = string("clip_90_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_6_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(153014912)))];
+            tensor<fp16, [1, 2304, 768]> linear_46_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_6_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_90_cast_fp16)[name = string("linear_46_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.06p+2)];
+            fp16 model_vision_tower_encoder_layers_6_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.04p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_91_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_46_cast_fp16)[name = string("clip_91_cast_fp16")];
+            fp16 var_33_promoted_46_to_fp16 = const()[name = string("op_33_promoted_46_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2270_cast_fp16 = pow(x = clip_91_cast_fp16, y = var_33_promoted_46_to_fp16)[name = string("op_2270_cast_fp16")];
+            tensor<int32, [1]> var_2272_axes_0 = const()[name = string("op_2272_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2272_keep_dims_0 = const()[name = string("op_2272_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2272_cast_fp16 = reduce_mean(axes = var_2272_axes_0, keep_dims = var_2272_keep_dims_0, x = var_2270_cast_fp16)[name = string("op_2272_cast_fp16")];
+            fp16 var_2273_to_fp16 = const()[name = string("op_2273_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_93_cast_fp16 = add(x = var_2272_cast_fp16, y = var_2273_to_fp16)[name = string("mean_squared_93_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2275_cast_fp16 = pow(x = mean_squared_93_cast_fp16, y = var_27_to_fp16)[name = string("op_2275_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_171_cast_fp16 = mul(x = clip_91_cast_fp16, y = var_2275_cast_fp16)[name = string("normed_output_171_cast_fp16")];
+            tensor<fp16, [768]> const_171_to_fp16 = const()[name = string("const_171_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(154194624)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_173_cast_fp16 = mul(x = normed_output_171_cast_fp16, y = const_171_to_fp16)[name = string("normed_output_173_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_389_cast_fp16 = add(x = hidden_states_351_cast_fp16, y = normed_output_173_cast_fp16)[name = string("hidden_states_389_cast_fp16")];
+            fp16 var_33_promoted_47_to_fp16 = const()[name = string("op_33_promoted_47_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2283_cast_fp16 = pow(x = hidden_states_389_cast_fp16, y = var_33_promoted_47_to_fp16)[name = string("op_2283_cast_fp16")];
+            tensor<int32, [1]> var_2285_axes_0 = const()[name = string("op_2285_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2285_keep_dims_0 = const()[name = string("op_2285_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2285_cast_fp16 = reduce_mean(axes = var_2285_axes_0, keep_dims = var_2285_keep_dims_0, x = var_2283_cast_fp16)[name = string("op_2285_cast_fp16")];
+            fp16 var_2286_to_fp16 = const()[name = string("op_2286_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_95_cast_fp16 = add(x = var_2285_cast_fp16, y = var_2286_to_fp16)[name = string("mean_squared_95_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2288_cast_fp16 = pow(x = mean_squared_95_cast_fp16, y = var_27_to_fp16)[name = string("op_2288_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_175_cast_fp16 = mul(x = hidden_states_389_cast_fp16, y = var_2288_cast_fp16)[name = string("normed_output_175_cast_fp16")];
+            tensor<fp16, [768]> const_172_to_fp16 = const()[name = string("const_172_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(154196224)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_177_cast_fp16 = mul(x = normed_output_175_cast_fp16, y = const_172_to_fp16)[name = string("normed_output_177_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.86p+3)];
+            fp16 model_vision_tower_encoder_layers_6_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.82p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_92_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_177_cast_fp16)[name = string("clip_92_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_6_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(154197824)))];
+            tensor<fp16, [1, 2304, 3072]> linear_47_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_6_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_92_cast_fp16)[name = string("linear_47_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e6p+3)];
+            fp16 model_vision_tower_encoder_layers_6_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.e2p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_93_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_47_cast_fp16)[name = string("clip_93_cast_fp16")];
+            string var_2305_mode_0 = const()[name = string("op_2305_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_2305_cast_fp16 = gelu(mode = var_2305_mode_0, x = clip_93_cast_fp16)[name = string("op_2305_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_6_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(158916480)))];
+            tensor<fp16, [1, 2304, 3072]> linear_48_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_6_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_92_cast_fp16)[name = string("linear_48_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e6p+3)];
+            fp16 model_vision_tower_encoder_layers_6_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.e2p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_95_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_mlp_up_proj_output_max_promoted_to_fp16, x = linear_48_cast_fp16)[name = string("clip_95_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_399_cast_fp16 = mul(x = var_2305_cast_fp16, y = clip_95_cast_fp16)[name = string("hidden_states_399_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.1ep+6)];
+            fp16 model_vision_tower_encoder_layers_6_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.1cp+6)];
+            tensor<fp16, [1, 2304, 3072]> clip_96_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_399_cast_fp16)[name = string("clip_96_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_6_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(163635136)))];
+            tensor<fp16, [1, 2304, 768]> linear_49_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_6_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_96_cast_fp16)[name = string("linear_49_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_6_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.34p+5)];
+            fp16 model_vision_tower_encoder_layers_6_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_6_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3p+5)];
+            tensor<fp16, [1, 2304, 768]> clip_97_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_6_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_6_mlp_down_proj_output_max_promoted_to_fp16, x = linear_49_cast_fp16)[name = string("clip_97_cast_fp16")];
+            fp16 var_33_promoted_48_to_fp16 = const()[name = string("op_33_promoted_48_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2327_cast_fp16 = pow(x = clip_97_cast_fp16, y = var_33_promoted_48_to_fp16)[name = string("op_2327_cast_fp16")];
+            tensor<int32, [1]> var_2329_axes_0 = const()[name = string("op_2329_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2329_keep_dims_0 = const()[name = string("op_2329_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2329_cast_fp16 = reduce_mean(axes = var_2329_axes_0, keep_dims = var_2329_keep_dims_0, x = var_2327_cast_fp16)[name = string("op_2329_cast_fp16")];
+            fp16 var_2330_to_fp16 = const()[name = string("op_2330_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_97_cast_fp16 = add(x = var_2329_cast_fp16, y = var_2330_to_fp16)[name = string("mean_squared_97_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2332_cast_fp16 = pow(x = mean_squared_97_cast_fp16, y = var_27_to_fp16)[name = string("op_2332_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_179_cast_fp16 = mul(x = clip_97_cast_fp16, y = var_2332_cast_fp16)[name = string("normed_output_179_cast_fp16")];
+            tensor<fp16, [768]> const_173_to_fp16 = const()[name = string("const_173_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168353792)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_181_cast_fp16 = mul(x = normed_output_179_cast_fp16, y = const_173_to_fp16)[name = string("normed_output_181_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_409_cast_fp16 = add(x = hidden_states_389_cast_fp16, y = normed_output_181_cast_fp16)[name = string("hidden_states_409_cast_fp16")];
+            fp16 var_33_promoted_49_to_fp16 = const()[name = string("op_33_promoted_49_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2346_cast_fp16 = pow(x = hidden_states_409_cast_fp16, y = var_33_promoted_49_to_fp16)[name = string("op_2346_cast_fp16")];
+            tensor<int32, [1]> var_2348_axes_0 = const()[name = string("op_2348_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2348_keep_dims_0 = const()[name = string("op_2348_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2348_cast_fp16 = reduce_mean(axes = var_2348_axes_0, keep_dims = var_2348_keep_dims_0, x = var_2346_cast_fp16)[name = string("op_2348_cast_fp16")];
+            fp16 var_2349_to_fp16 = const()[name = string("op_2349_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_99_cast_fp16 = add(x = var_2348_cast_fp16, y = var_2349_to_fp16)[name = string("mean_squared_99_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2351_cast_fp16 = pow(x = mean_squared_99_cast_fp16, y = var_27_to_fp16)[name = string("op_2351_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_183_cast_fp16 = mul(x = hidden_states_409_cast_fp16, y = var_2351_cast_fp16)[name = string("normed_output_183_cast_fp16")];
+            tensor<fp16, [768]> const_174_to_fp16 = const()[name = string("const_174_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168355392)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_185_cast_fp16 = mul(x = normed_output_183_cast_fp16, y = const_174_to_fp16)[name = string("normed_output_185_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.26p+3)];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.24p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_98_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_185_cast_fp16)[name = string("clip_98_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_7_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(168356992)))];
+            tensor<fp16, [1, 2304, 768]> linear_50_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_7_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_98_cast_fp16)[name = string("linear_50_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.92p+3)];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.8ep+3)];
+            tensor<fp16, [1, 2304, 768]> clip_99_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_50_cast_fp16)[name = string("clip_99_cast_fp16")];
+            tensor<int32, [4]> var_2373 = const()[name = string("op_2373"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_417_cast_fp16 = reshape(shape = var_2373, x = clip_99_cast_fp16)[name = string("hidden_states_417_cast_fp16")];
+            fp16 var_33_promoted_50_to_fp16 = const()[name = string("op_33_promoted_50_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2377_cast_fp16 = pow(x = hidden_states_417_cast_fp16, y = var_33_promoted_50_to_fp16)[name = string("op_2377_cast_fp16")];
+            tensor<int32, [1]> var_2379_axes_0 = const()[name = string("op_2379_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2379_keep_dims_0 = const()[name = string("op_2379_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2379_cast_fp16 = reduce_mean(axes = var_2379_axes_0, keep_dims = var_2379_keep_dims_0, x = var_2377_cast_fp16)[name = string("op_2379_cast_fp16")];
+            fp16 var_2380_to_fp16 = const()[name = string("op_2380_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_101_cast_fp16 = add(x = var_2379_cast_fp16, y = var_2380_to_fp16)[name = string("mean_squared_101_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2382_cast_fp16 = pow(x = mean_squared_101_cast_fp16, y = var_27_to_fp16)[name = string("op_2382_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_187_cast_fp16 = mul(x = hidden_states_417_cast_fp16, y = var_2382_cast_fp16)[name = string("normed_output_187_cast_fp16")];
+            tensor<fp16, [64]> const_177_to_fp16 = const()[name = string("const_177_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(169536704)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_189_cast_fp16 = mul(x = normed_output_187_cast_fp16, y = const_177_to_fp16)[name = string("normed_output_189_cast_fp16")];
+            tensor<int32, [2]> var_2402 = const()[name = string("op_2402"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2403_axis_0 = const()[name = string("op_2403_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2403_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_2403_cast_fp16_1 = split(axis = var_2403_axis_0, split_sizes = var_2402, x = normed_output_189_cast_fp16)[name = string("op_2403_cast_fp16")];
+            tensor<int32, [2]> var_2406 = const()[name = string("op_2406"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2407_axis_0 = const()[name = string("op_2407_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2407_0, tensor<fp16, [1, 2304, 32]> var_2407_1 = split(axis = var_2407_axis_0, split_sizes = var_2406, x = var_160_cast_fp16)[name = string("op_2407")];
+            tensor<int32, [2]> var_2410 = const()[name = string("op_2410"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2411_axis_0 = const()[name = string("op_2411_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2411_0, tensor<fp16, [1, 2304, 32]> var_2411_1 = split(axis = var_2411_axis_0, split_sizes = var_2410, x = var_163_cast_fp16)[name = string("op_2411")];
+            tensor<int32, [1]> cos_117_axes_0 = const()[name = string("cos_117_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_117 = expand_dims(axes = cos_117_axes_0, x = var_2407_0)[name = string("cos_117")];
+            tensor<int32, [1]> sin_117_axes_0 = const()[name = string("sin_117_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_117 = expand_dims(axes = sin_117_axes_0, x = var_2411_0)[name = string("sin_117")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2416_cast_fp16 = mul(x = var_2403_cast_fp16_0, y = cos_117)[name = string("op_2416_cast_fp16")];
+            tensor<int32, [4]> x1_57_begin_0 = const()[name = string("x1_57_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_57_end_0 = const()[name = string("x1_57_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_57_end_mask_0 = const()[name = string("x1_57_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_57_cast_fp16 = slice_by_index(begin = x1_57_begin_0, end = x1_57_end_0, end_mask = x1_57_end_mask_0, x = var_2403_cast_fp16_0)[name = string("x1_57_cast_fp16")];
+            tensor<int32, [4]> x2_57_begin_0 = const()[name = string("x2_57_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_57_end_0 = const()[name = string("x2_57_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_57_end_mask_0 = const()[name = string("x2_57_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_57_cast_fp16 = slice_by_index(begin = x2_57_begin_0, end = x2_57_end_0, end_mask = x2_57_end_mask_0, x = var_2403_cast_fp16_0)[name = string("x2_57_cast_fp16")];
+            fp16 const_182_promoted_to_fp16 = const()[name = string("const_182_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2427_cast_fp16 = mul(x = x2_57_cast_fp16, y = const_182_promoted_to_fp16)[name = string("op_2427_cast_fp16")];
+            bool var_2429_interleave_0 = const()[name = string("op_2429_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2429_cast_fp16 = concat(axis = var_38, interleave = var_2429_interleave_0, values = (var_2427_cast_fp16, x1_57_cast_fp16))[name = string("op_2429_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2430_cast_fp16 = mul(x = var_2429_cast_fp16, y = sin_117)[name = string("op_2430_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2431_cast_fp16 = add(x = var_2416_cast_fp16, y = var_2430_cast_fp16)[name = string("op_2431_cast_fp16")];
+            tensor<int32, [1]> cos_121_axes_0 = const()[name = string("cos_121_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_121 = expand_dims(axes = cos_121_axes_0, x = var_2407_1)[name = string("cos_121")];
+            tensor<int32, [1]> sin_121_axes_0 = const()[name = string("sin_121_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_121 = expand_dims(axes = sin_121_axes_0, x = var_2411_1)[name = string("sin_121")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2434_cast_fp16 = mul(x = var_2403_cast_fp16_1, y = cos_121)[name = string("op_2434_cast_fp16")];
+            tensor<int32, [4]> x1_59_begin_0 = const()[name = string("x1_59_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_59_end_0 = const()[name = string("x1_59_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_59_end_mask_0 = const()[name = string("x1_59_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_59_cast_fp16 = slice_by_index(begin = x1_59_begin_0, end = x1_59_end_0, end_mask = x1_59_end_mask_0, x = var_2403_cast_fp16_1)[name = string("x1_59_cast_fp16")];
+            tensor<int32, [4]> x2_59_begin_0 = const()[name = string("x2_59_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_59_end_0 = const()[name = string("x2_59_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_59_end_mask_0 = const()[name = string("x2_59_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_59_cast_fp16 = slice_by_index(begin = x2_59_begin_0, end = x2_59_end_0, end_mask = x2_59_end_mask_0, x = var_2403_cast_fp16_1)[name = string("x2_59_cast_fp16")];
+            fp16 const_185_promoted_to_fp16 = const()[name = string("const_185_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2445_cast_fp16 = mul(x = x2_59_cast_fp16, y = const_185_promoted_to_fp16)[name = string("op_2445_cast_fp16")];
+            bool var_2447_interleave_0 = const()[name = string("op_2447_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2447_cast_fp16 = concat(axis = var_38, interleave = var_2447_interleave_0, values = (var_2445_cast_fp16, x1_59_cast_fp16))[name = string("op_2447_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2448_cast_fp16 = mul(x = var_2447_cast_fp16, y = sin_121)[name = string("op_2448_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2449_cast_fp16 = add(x = var_2434_cast_fp16, y = var_2448_cast_fp16)[name = string("op_2449_cast_fp16")];
+            bool query_states_15_interleave_0 = const()[name = string("query_states_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_15_cast_fp16 = concat(axis = var_38, interleave = query_states_15_interleave_0, values = (var_2431_cast_fp16, var_2449_cast_fp16))[name = string("query_states_15_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_7_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(169536896)))];
+            tensor<fp16, [1, 2304, 768]> linear_51_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_7_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_98_cast_fp16)[name = string("linear_51_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e4p+3)];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.ep+3)];
+            tensor<fp16, [1, 2304, 768]> clip_101_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_51_cast_fp16)[name = string("clip_101_cast_fp16")];
+            tensor<int32, [4]> var_2462 = const()[name = string("op_2462"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_423_cast_fp16 = reshape(shape = var_2462, x = clip_101_cast_fp16)[name = string("hidden_states_423_cast_fp16")];
+            fp16 var_33_promoted_51_to_fp16 = const()[name = string("op_33_promoted_51_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2466_cast_fp16 = pow(x = hidden_states_423_cast_fp16, y = var_33_promoted_51_to_fp16)[name = string("op_2466_cast_fp16")];
+            tensor<int32, [1]> var_2468_axes_0 = const()[name = string("op_2468_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2468_keep_dims_0 = const()[name = string("op_2468_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2468_cast_fp16 = reduce_mean(axes = var_2468_axes_0, keep_dims = var_2468_keep_dims_0, x = var_2466_cast_fp16)[name = string("op_2468_cast_fp16")];
+            fp16 var_2469_to_fp16 = const()[name = string("op_2469_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_103_cast_fp16 = add(x = var_2468_cast_fp16, y = var_2469_to_fp16)[name = string("mean_squared_103_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2471_cast_fp16 = pow(x = mean_squared_103_cast_fp16, y = var_27_to_fp16)[name = string("op_2471_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_191_cast_fp16 = mul(x = hidden_states_423_cast_fp16, y = var_2471_cast_fp16)[name = string("normed_output_191_cast_fp16")];
+            tensor<fp16, [64]> const_186_to_fp16 = const()[name = string("const_186_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(170716608)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_193_cast_fp16 = mul(x = normed_output_191_cast_fp16, y = const_186_to_fp16)[name = string("normed_output_193_cast_fp16")];
+            tensor<int32, [2]> var_2491 = const()[name = string("op_2491"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2492_axis_0 = const()[name = string("op_2492_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2492_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_2492_cast_fp16_1 = split(axis = var_2492_axis_0, split_sizes = var_2491, x = normed_output_193_cast_fp16)[name = string("op_2492_cast_fp16")];
+            tensor<int32, [2]> var_2495 = const()[name = string("op_2495"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2496_axis_0 = const()[name = string("op_2496_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2496_0, tensor<fp16, [1, 2304, 32]> var_2496_1 = split(axis = var_2496_axis_0, split_sizes = var_2495, x = var_160_cast_fp16)[name = string("op_2496")];
+            tensor<int32, [2]> var_2499 = const()[name = string("op_2499"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2500_axis_0 = const()[name = string("op_2500_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2500_0, tensor<fp16, [1, 2304, 32]> var_2500_1 = split(axis = var_2500_axis_0, split_sizes = var_2499, x = var_163_cast_fp16)[name = string("op_2500")];
+            tensor<int32, [1]> cos_125_axes_0 = const()[name = string("cos_125_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_125 = expand_dims(axes = cos_125_axes_0, x = var_2496_0)[name = string("cos_125")];
+            tensor<int32, [1]> sin_125_axes_0 = const()[name = string("sin_125_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_125 = expand_dims(axes = sin_125_axes_0, x = var_2500_0)[name = string("sin_125")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2505_cast_fp16 = mul(x = var_2492_cast_fp16_0, y = cos_125)[name = string("op_2505_cast_fp16")];
+            tensor<int32, [4]> x1_61_begin_0 = const()[name = string("x1_61_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_61_end_0 = const()[name = string("x1_61_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_61_end_mask_0 = const()[name = string("x1_61_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_61_cast_fp16 = slice_by_index(begin = x1_61_begin_0, end = x1_61_end_0, end_mask = x1_61_end_mask_0, x = var_2492_cast_fp16_0)[name = string("x1_61_cast_fp16")];
+            tensor<int32, [4]> x2_61_begin_0 = const()[name = string("x2_61_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_61_end_0 = const()[name = string("x2_61_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_61_end_mask_0 = const()[name = string("x2_61_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_61_cast_fp16 = slice_by_index(begin = x2_61_begin_0, end = x2_61_end_0, end_mask = x2_61_end_mask_0, x = var_2492_cast_fp16_0)[name = string("x2_61_cast_fp16")];
+            fp16 const_191_promoted_to_fp16 = const()[name = string("const_191_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2516_cast_fp16 = mul(x = x2_61_cast_fp16, y = const_191_promoted_to_fp16)[name = string("op_2516_cast_fp16")];
+            bool var_2518_interleave_0 = const()[name = string("op_2518_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2518_cast_fp16 = concat(axis = var_38, interleave = var_2518_interleave_0, values = (var_2516_cast_fp16, x1_61_cast_fp16))[name = string("op_2518_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2519_cast_fp16 = mul(x = var_2518_cast_fp16, y = sin_125)[name = string("op_2519_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2520_cast_fp16 = add(x = var_2505_cast_fp16, y = var_2519_cast_fp16)[name = string("op_2520_cast_fp16")];
+            tensor<int32, [1]> cos_129_axes_0 = const()[name = string("cos_129_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_129 = expand_dims(axes = cos_129_axes_0, x = var_2496_1)[name = string("cos_129")];
+            tensor<int32, [1]> sin_129_axes_0 = const()[name = string("sin_129_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_129 = expand_dims(axes = sin_129_axes_0, x = var_2500_1)[name = string("sin_129")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2523_cast_fp16 = mul(x = var_2492_cast_fp16_1, y = cos_129)[name = string("op_2523_cast_fp16")];
+            tensor<int32, [4]> x1_63_begin_0 = const()[name = string("x1_63_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_63_end_0 = const()[name = string("x1_63_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_63_end_mask_0 = const()[name = string("x1_63_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_63_cast_fp16 = slice_by_index(begin = x1_63_begin_0, end = x1_63_end_0, end_mask = x1_63_end_mask_0, x = var_2492_cast_fp16_1)[name = string("x1_63_cast_fp16")];
+            tensor<int32, [4]> x2_63_begin_0 = const()[name = string("x2_63_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_63_end_0 = const()[name = string("x2_63_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_63_end_mask_0 = const()[name = string("x2_63_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_63_cast_fp16 = slice_by_index(begin = x2_63_begin_0, end = x2_63_end_0, end_mask = x2_63_end_mask_0, x = var_2492_cast_fp16_1)[name = string("x2_63_cast_fp16")];
+            fp16 const_194_promoted_to_fp16 = const()[name = string("const_194_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2534_cast_fp16 = mul(x = x2_63_cast_fp16, y = const_194_promoted_to_fp16)[name = string("op_2534_cast_fp16")];
+            bool var_2536_interleave_0 = const()[name = string("op_2536_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2536_cast_fp16 = concat(axis = var_38, interleave = var_2536_interleave_0, values = (var_2534_cast_fp16, x1_63_cast_fp16))[name = string("op_2536_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2537_cast_fp16 = mul(x = var_2536_cast_fp16, y = sin_129)[name = string("op_2537_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2538_cast_fp16 = add(x = var_2523_cast_fp16, y = var_2537_cast_fp16)[name = string("op_2538_cast_fp16")];
+            bool key_states_15_interleave_0 = const()[name = string("key_states_15_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_15_cast_fp16 = concat(axis = var_38, interleave = key_states_15_interleave_0, values = (var_2520_cast_fp16, var_2538_cast_fp16))[name = string("key_states_15_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_7_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(170716800)))];
+            tensor<fp16, [1, 2304, 768]> linear_52_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_7_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_98_cast_fp16)[name = string("linear_52_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e4p+3)];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.ep+3)];
+            tensor<fp16, [1, 2304, 768]> clip_103_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_52_cast_fp16)[name = string("clip_103_cast_fp16")];
+            tensor<int32, [4]> var_2551 = const()[name = string("op_2551"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_429_cast_fp16 = reshape(shape = var_2551, x = clip_103_cast_fp16)[name = string("hidden_states_429_cast_fp16")];
+            fp16 var_33_promoted_52_to_fp16 = const()[name = string("op_33_promoted_52_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2554_cast_fp16 = pow(x = hidden_states_429_cast_fp16, y = var_33_promoted_52_to_fp16)[name = string("op_2554_cast_fp16")];
+            tensor<int32, [1]> var_2556_axes_0 = const()[name = string("op_2556_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2556_keep_dims_0 = const()[name = string("op_2556_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2556_cast_fp16 = reduce_mean(axes = var_2556_axes_0, keep_dims = var_2556_keep_dims_0, x = var_2554_cast_fp16)[name = string("op_2556_cast_fp16")];
+            fp16 var_2557_to_fp16 = const()[name = string("op_2557_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_105_cast_fp16 = add(x = var_2556_cast_fp16, y = var_2557_to_fp16)[name = string("mean_squared_105_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2559_cast_fp16 = pow(x = mean_squared_105_cast_fp16, y = var_27_to_fp16)[name = string("op_2559_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_195_cast_fp16 = mul(x = hidden_states_429_cast_fp16, y = var_2559_cast_fp16)[name = string("normed_output_195_cast_fp16")];
+            tensor<int32, [4]> hidden_states_435_perm_0 = const()[name = string("hidden_states_435_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_7_transpose_y_0 = const()[name = string("matmul_7_transpose_y_0"), val = bool(true)];
+            bool matmul_7_transpose_x_0 = const()[name = string("matmul_7_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_78_perm_0 = const()[name = string("transpose_78_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_79_perm_0 = const()[name = string("transpose_79_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_79 = transpose(perm = transpose_79_perm_0, x = key_states_15_cast_fp16)[name = string("transpose_129")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_78 = transpose(perm = transpose_78_perm_0, x = query_states_15_cast_fp16)[name = string("transpose_130")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_7_cast_fp16 = matmul(transpose_x = matmul_7_transpose_x_0, transpose_y = matmul_7_transpose_y_0, x = transpose_78, y = transpose_79)[name = string("matmul_7_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_7_cast_fp16 = add(x = matmul_7_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_7_cast_fp16")];
+            int32 softmax_7_axis_0 = const()[name = string("softmax_7_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_7_cast_fp16 = softmax(axis = softmax_7_axis_0, x = add_7_cast_fp16)[name = string("softmax_7_cast_fp16")];
+            bool attn_output_29_transpose_x_0 = const()[name = string("attn_output_29_transpose_x_0"), val = bool(false)];
+            bool attn_output_29_transpose_y_0 = const()[name = string("attn_output_29_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_435_cast_fp16 = transpose(perm = hidden_states_435_perm_0, x = normed_output_195_cast_fp16)[name = string("transpose_131")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_29_cast_fp16 = matmul(transpose_x = attn_output_29_transpose_x_0, transpose_y = attn_output_29_transpose_y_0, x = softmax_7_cast_fp16, y = hidden_states_435_cast_fp16)[name = string("attn_output_29_cast_fp16")];
+            tensor<int32, [4]> var_2564_perm_0 = const()[name = string("op_2564_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2566 = const()[name = string("op_2566"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_2564_cast_fp16 = transpose(perm = var_2564_perm_0, x = attn_output_29_cast_fp16)[name = string("transpose_128")];
+            tensor<fp16, [1, 2304, 768]> var_2567_cast_fp16 = reshape(shape = var_2566, x = var_2564_cast_fp16)[name = string("op_2567_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.46p+1)];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.42p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_104_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_self_attn_o_proj_input_max_promoted_to_fp16, x = var_2567_cast_fp16)[name = string("clip_104_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_7_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(171896512)))];
+            tensor<fp16, [1, 2304, 768]> linear_53_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_7_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_104_cast_fp16)[name = string("linear_53_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e4p+1)];
+            fp16 model_vision_tower_encoder_layers_7_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.ep+1)];
+            tensor<fp16, [1, 2304, 768]> clip_105_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_53_cast_fp16)[name = string("clip_105_cast_fp16")];
+            fp16 var_33_promoted_53_to_fp16 = const()[name = string("op_33_promoted_53_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2580_cast_fp16 = pow(x = clip_105_cast_fp16, y = var_33_promoted_53_to_fp16)[name = string("op_2580_cast_fp16")];
+            tensor<int32, [1]> var_2582_axes_0 = const()[name = string("op_2582_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2582_keep_dims_0 = const()[name = string("op_2582_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2582_cast_fp16 = reduce_mean(axes = var_2582_axes_0, keep_dims = var_2582_keep_dims_0, x = var_2580_cast_fp16)[name = string("op_2582_cast_fp16")];
+            fp16 var_2583_to_fp16 = const()[name = string("op_2583_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_107_cast_fp16 = add(x = var_2582_cast_fp16, y = var_2583_to_fp16)[name = string("mean_squared_107_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2585_cast_fp16 = pow(x = mean_squared_107_cast_fp16, y = var_27_to_fp16)[name = string("op_2585_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_197_cast_fp16 = mul(x = clip_105_cast_fp16, y = var_2585_cast_fp16)[name = string("normed_output_197_cast_fp16")];
+            tensor<fp16, [768]> const_195_to_fp16 = const()[name = string("const_195_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(173076224)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_199_cast_fp16 = mul(x = normed_output_197_cast_fp16, y = const_195_to_fp16)[name = string("normed_output_199_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_447_cast_fp16 = add(x = hidden_states_409_cast_fp16, y = normed_output_199_cast_fp16)[name = string("hidden_states_447_cast_fp16")];
+            fp16 var_33_promoted_54_to_fp16 = const()[name = string("op_33_promoted_54_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2593_cast_fp16 = pow(x = hidden_states_447_cast_fp16, y = var_33_promoted_54_to_fp16)[name = string("op_2593_cast_fp16")];
+            tensor<int32, [1]> var_2595_axes_0 = const()[name = string("op_2595_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2595_keep_dims_0 = const()[name = string("op_2595_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2595_cast_fp16 = reduce_mean(axes = var_2595_axes_0, keep_dims = var_2595_keep_dims_0, x = var_2593_cast_fp16)[name = string("op_2595_cast_fp16")];
+            fp16 var_2596_to_fp16 = const()[name = string("op_2596_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_109_cast_fp16 = add(x = var_2595_cast_fp16, y = var_2596_to_fp16)[name = string("mean_squared_109_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2598_cast_fp16 = pow(x = mean_squared_109_cast_fp16, y = var_27_to_fp16)[name = string("op_2598_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_201_cast_fp16 = mul(x = hidden_states_447_cast_fp16, y = var_2598_cast_fp16)[name = string("normed_output_201_cast_fp16")];
+            tensor<fp16, [768]> const_196_to_fp16 = const()[name = string("const_196_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(173077824)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_203_cast_fp16 = mul(x = normed_output_201_cast_fp16, y = const_196_to_fp16)[name = string("normed_output_203_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.1ep+3)];
+            fp16 model_vision_tower_encoder_layers_7_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.1cp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_106_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_203_cast_fp16)[name = string("clip_106_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_7_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(173079424)))];
+            tensor<fp16, [1, 2304, 3072]> linear_54_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_7_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_106_cast_fp16)[name = string("linear_54_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.48p+3)];
+            fp16 model_vision_tower_encoder_layers_7_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.46p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_107_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_54_cast_fp16)[name = string("clip_107_cast_fp16")];
+            string var_2615_mode_0 = const()[name = string("op_2615_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_2615_cast_fp16 = gelu(mode = var_2615_mode_0, x = clip_107_cast_fp16)[name = string("op_2615_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_7_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(177798080)))];
+            tensor<fp16, [1, 2304, 3072]> linear_55_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_7_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_106_cast_fp16)[name = string("linear_55_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.48p+3)];
+            fp16 model_vision_tower_encoder_layers_7_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.46p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_109_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_mlp_up_proj_output_max_promoted_to_fp16, x = linear_55_cast_fp16)[name = string("clip_109_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_457_cast_fp16 = mul(x = var_2615_cast_fp16, y = clip_109_cast_fp16)[name = string("hidden_states_457_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.02p+5)];
+            fp16 model_vision_tower_encoder_layers_7_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1p+5)];
+            tensor<fp16, [1, 2304, 3072]> clip_110_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_457_cast_fp16)[name = string("clip_110_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_7_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(182516736)))];
+            tensor<fp16, [1, 2304, 768]> linear_56_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_7_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_110_cast_fp16)[name = string("linear_56_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_7_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.dap+3)];
+            fp16 model_vision_tower_encoder_layers_7_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_7_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.d6p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_111_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_7_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_7_mlp_down_proj_output_max_promoted_to_fp16, x = linear_56_cast_fp16)[name = string("clip_111_cast_fp16")];
+            fp16 var_33_promoted_55_to_fp16 = const()[name = string("op_33_promoted_55_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2637_cast_fp16 = pow(x = clip_111_cast_fp16, y = var_33_promoted_55_to_fp16)[name = string("op_2637_cast_fp16")];
+            tensor<int32, [1]> var_2639_axes_0 = const()[name = string("op_2639_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2639_keep_dims_0 = const()[name = string("op_2639_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2639_cast_fp16 = reduce_mean(axes = var_2639_axes_0, keep_dims = var_2639_keep_dims_0, x = var_2637_cast_fp16)[name = string("op_2639_cast_fp16")];
+            fp16 var_2640_to_fp16 = const()[name = string("op_2640_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_111_cast_fp16 = add(x = var_2639_cast_fp16, y = var_2640_to_fp16)[name = string("mean_squared_111_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2642_cast_fp16 = pow(x = mean_squared_111_cast_fp16, y = var_27_to_fp16)[name = string("op_2642_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_205_cast_fp16 = mul(x = clip_111_cast_fp16, y = var_2642_cast_fp16)[name = string("normed_output_205_cast_fp16")];
+            tensor<fp16, [768]> const_197_to_fp16 = const()[name = string("const_197_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187235392)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_207_cast_fp16 = mul(x = normed_output_205_cast_fp16, y = const_197_to_fp16)[name = string("normed_output_207_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_467_cast_fp16 = add(x = hidden_states_447_cast_fp16, y = normed_output_207_cast_fp16)[name = string("hidden_states_467_cast_fp16")];
+            fp16 var_33_promoted_56_to_fp16 = const()[name = string("op_33_promoted_56_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2656_cast_fp16 = pow(x = hidden_states_467_cast_fp16, y = var_33_promoted_56_to_fp16)[name = string("op_2656_cast_fp16")];
+            tensor<int32, [1]> var_2658_axes_0 = const()[name = string("op_2658_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2658_keep_dims_0 = const()[name = string("op_2658_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2658_cast_fp16 = reduce_mean(axes = var_2658_axes_0, keep_dims = var_2658_keep_dims_0, x = var_2656_cast_fp16)[name = string("op_2658_cast_fp16")];
+            fp16 var_2659_to_fp16 = const()[name = string("op_2659_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_113_cast_fp16 = add(x = var_2658_cast_fp16, y = var_2659_to_fp16)[name = string("mean_squared_113_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2661_cast_fp16 = pow(x = mean_squared_113_cast_fp16, y = var_27_to_fp16)[name = string("op_2661_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_209_cast_fp16 = mul(x = hidden_states_467_cast_fp16, y = var_2661_cast_fp16)[name = string("normed_output_209_cast_fp16")];
+            tensor<fp16, [768]> const_198_to_fp16 = const()[name = string("const_198_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187236992)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_211_cast_fp16 = mul(x = normed_output_209_cast_fp16, y = const_198_to_fp16)[name = string("normed_output_211_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.52p+3)];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.4ep+3)];
+            tensor<fp16, [1, 2304, 768]> clip_112_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_211_cast_fp16)[name = string("clip_112_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_8_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(187238592)))];
+            tensor<fp16, [1, 2304, 768]> linear_57_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_8_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_112_cast_fp16)[name = string("linear_57_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.dcp+3)];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.d8p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_113_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_57_cast_fp16)[name = string("clip_113_cast_fp16")];
+            tensor<int32, [4]> var_2683 = const()[name = string("op_2683"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_475_cast_fp16 = reshape(shape = var_2683, x = clip_113_cast_fp16)[name = string("hidden_states_475_cast_fp16")];
+            fp16 var_33_promoted_57_to_fp16 = const()[name = string("op_33_promoted_57_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2687_cast_fp16 = pow(x = hidden_states_475_cast_fp16, y = var_33_promoted_57_to_fp16)[name = string("op_2687_cast_fp16")];
+            tensor<int32, [1]> var_2689_axes_0 = const()[name = string("op_2689_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2689_keep_dims_0 = const()[name = string("op_2689_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2689_cast_fp16 = reduce_mean(axes = var_2689_axes_0, keep_dims = var_2689_keep_dims_0, x = var_2687_cast_fp16)[name = string("op_2689_cast_fp16")];
+            fp16 var_2690_to_fp16 = const()[name = string("op_2690_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_115_cast_fp16 = add(x = var_2689_cast_fp16, y = var_2690_to_fp16)[name = string("mean_squared_115_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2692_cast_fp16 = pow(x = mean_squared_115_cast_fp16, y = var_27_to_fp16)[name = string("op_2692_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_213_cast_fp16 = mul(x = hidden_states_475_cast_fp16, y = var_2692_cast_fp16)[name = string("normed_output_213_cast_fp16")];
+            tensor<fp16, [64]> const_201_to_fp16 = const()[name = string("const_201_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188418304)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_215_cast_fp16 = mul(x = normed_output_213_cast_fp16, y = const_201_to_fp16)[name = string("normed_output_215_cast_fp16")];
+            tensor<int32, [2]> var_2712 = const()[name = string("op_2712"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2713_axis_0 = const()[name = string("op_2713_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2713_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_2713_cast_fp16_1 = split(axis = var_2713_axis_0, split_sizes = var_2712, x = normed_output_215_cast_fp16)[name = string("op_2713_cast_fp16")];
+            tensor<int32, [2]> var_2716 = const()[name = string("op_2716"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2717_axis_0 = const()[name = string("op_2717_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2717_0, tensor<fp16, [1, 2304, 32]> var_2717_1 = split(axis = var_2717_axis_0, split_sizes = var_2716, x = var_160_cast_fp16)[name = string("op_2717")];
+            tensor<int32, [2]> var_2720 = const()[name = string("op_2720"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2721_axis_0 = const()[name = string("op_2721_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2721_0, tensor<fp16, [1, 2304, 32]> var_2721_1 = split(axis = var_2721_axis_0, split_sizes = var_2720, x = var_163_cast_fp16)[name = string("op_2721")];
+            tensor<int32, [1]> cos_133_axes_0 = const()[name = string("cos_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_133 = expand_dims(axes = cos_133_axes_0, x = var_2717_0)[name = string("cos_133")];
+            tensor<int32, [1]> sin_133_axes_0 = const()[name = string("sin_133_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_133 = expand_dims(axes = sin_133_axes_0, x = var_2721_0)[name = string("sin_133")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2726_cast_fp16 = mul(x = var_2713_cast_fp16_0, y = cos_133)[name = string("op_2726_cast_fp16")];
+            tensor<int32, [4]> x1_65_begin_0 = const()[name = string("x1_65_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_65_end_0 = const()[name = string("x1_65_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_65_end_mask_0 = const()[name = string("x1_65_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_65_cast_fp16 = slice_by_index(begin = x1_65_begin_0, end = x1_65_end_0, end_mask = x1_65_end_mask_0, x = var_2713_cast_fp16_0)[name = string("x1_65_cast_fp16")];
+            tensor<int32, [4]> x2_65_begin_0 = const()[name = string("x2_65_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_65_end_0 = const()[name = string("x2_65_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_65_end_mask_0 = const()[name = string("x2_65_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_65_cast_fp16 = slice_by_index(begin = x2_65_begin_0, end = x2_65_end_0, end_mask = x2_65_end_mask_0, x = var_2713_cast_fp16_0)[name = string("x2_65_cast_fp16")];
+            fp16 const_206_promoted_to_fp16 = const()[name = string("const_206_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2737_cast_fp16 = mul(x = x2_65_cast_fp16, y = const_206_promoted_to_fp16)[name = string("op_2737_cast_fp16")];
+            bool var_2739_interleave_0 = const()[name = string("op_2739_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2739_cast_fp16 = concat(axis = var_38, interleave = var_2739_interleave_0, values = (var_2737_cast_fp16, x1_65_cast_fp16))[name = string("op_2739_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2740_cast_fp16 = mul(x = var_2739_cast_fp16, y = sin_133)[name = string("op_2740_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2741_cast_fp16 = add(x = var_2726_cast_fp16, y = var_2740_cast_fp16)[name = string("op_2741_cast_fp16")];
+            tensor<int32, [1]> cos_137_axes_0 = const()[name = string("cos_137_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_137 = expand_dims(axes = cos_137_axes_0, x = var_2717_1)[name = string("cos_137")];
+            tensor<int32, [1]> sin_137_axes_0 = const()[name = string("sin_137_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_137 = expand_dims(axes = sin_137_axes_0, x = var_2721_1)[name = string("sin_137")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2744_cast_fp16 = mul(x = var_2713_cast_fp16_1, y = cos_137)[name = string("op_2744_cast_fp16")];
+            tensor<int32, [4]> x1_67_begin_0 = const()[name = string("x1_67_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_67_end_0 = const()[name = string("x1_67_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_67_end_mask_0 = const()[name = string("x1_67_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_67_cast_fp16 = slice_by_index(begin = x1_67_begin_0, end = x1_67_end_0, end_mask = x1_67_end_mask_0, x = var_2713_cast_fp16_1)[name = string("x1_67_cast_fp16")];
+            tensor<int32, [4]> x2_67_begin_0 = const()[name = string("x2_67_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_67_end_0 = const()[name = string("x2_67_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_67_end_mask_0 = const()[name = string("x2_67_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_67_cast_fp16 = slice_by_index(begin = x2_67_begin_0, end = x2_67_end_0, end_mask = x2_67_end_mask_0, x = var_2713_cast_fp16_1)[name = string("x2_67_cast_fp16")];
+            fp16 const_209_promoted_to_fp16 = const()[name = string("const_209_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2755_cast_fp16 = mul(x = x2_67_cast_fp16, y = const_209_promoted_to_fp16)[name = string("op_2755_cast_fp16")];
+            bool var_2757_interleave_0 = const()[name = string("op_2757_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2757_cast_fp16 = concat(axis = var_38, interleave = var_2757_interleave_0, values = (var_2755_cast_fp16, x1_67_cast_fp16))[name = string("op_2757_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2758_cast_fp16 = mul(x = var_2757_cast_fp16, y = sin_137)[name = string("op_2758_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2759_cast_fp16 = add(x = var_2744_cast_fp16, y = var_2758_cast_fp16)[name = string("op_2759_cast_fp16")];
+            bool query_states_17_interleave_0 = const()[name = string("query_states_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_17_cast_fp16 = concat(axis = var_38, interleave = query_states_17_interleave_0, values = (var_2741_cast_fp16, var_2759_cast_fp16))[name = string("query_states_17_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_8_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(188418496)))];
+            tensor<fp16, [1, 2304, 768]> linear_58_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_8_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_112_cast_fp16)[name = string("linear_58_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.16p+4)];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.14p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_115_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_58_cast_fp16)[name = string("clip_115_cast_fp16")];
+            tensor<int32, [4]> var_2772 = const()[name = string("op_2772"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_481_cast_fp16 = reshape(shape = var_2772, x = clip_115_cast_fp16)[name = string("hidden_states_481_cast_fp16")];
+            fp16 var_33_promoted_58_to_fp16 = const()[name = string("op_33_promoted_58_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2776_cast_fp16 = pow(x = hidden_states_481_cast_fp16, y = var_33_promoted_58_to_fp16)[name = string("op_2776_cast_fp16")];
+            tensor<int32, [1]> var_2778_axes_0 = const()[name = string("op_2778_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2778_keep_dims_0 = const()[name = string("op_2778_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2778_cast_fp16 = reduce_mean(axes = var_2778_axes_0, keep_dims = var_2778_keep_dims_0, x = var_2776_cast_fp16)[name = string("op_2778_cast_fp16")];
+            fp16 var_2779_to_fp16 = const()[name = string("op_2779_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_117_cast_fp16 = add(x = var_2778_cast_fp16, y = var_2779_to_fp16)[name = string("mean_squared_117_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2781_cast_fp16 = pow(x = mean_squared_117_cast_fp16, y = var_27_to_fp16)[name = string("op_2781_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_217_cast_fp16 = mul(x = hidden_states_481_cast_fp16, y = var_2781_cast_fp16)[name = string("normed_output_217_cast_fp16")];
+            tensor<fp16, [64]> const_210_to_fp16 = const()[name = string("const_210_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(189598208)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_219_cast_fp16 = mul(x = normed_output_217_cast_fp16, y = const_210_to_fp16)[name = string("normed_output_219_cast_fp16")];
+            tensor<int32, [2]> var_2801 = const()[name = string("op_2801"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2802_axis_0 = const()[name = string("op_2802_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2802_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_2802_cast_fp16_1 = split(axis = var_2802_axis_0, split_sizes = var_2801, x = normed_output_219_cast_fp16)[name = string("op_2802_cast_fp16")];
+            tensor<int32, [2]> var_2805 = const()[name = string("op_2805"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2806_axis_0 = const()[name = string("op_2806_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2806_0, tensor<fp16, [1, 2304, 32]> var_2806_1 = split(axis = var_2806_axis_0, split_sizes = var_2805, x = var_160_cast_fp16)[name = string("op_2806")];
+            tensor<int32, [2]> var_2809 = const()[name = string("op_2809"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_2810_axis_0 = const()[name = string("op_2810_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_2810_0, tensor<fp16, [1, 2304, 32]> var_2810_1 = split(axis = var_2810_axis_0, split_sizes = var_2809, x = var_163_cast_fp16)[name = string("op_2810")];
+            tensor<int32, [1]> cos_141_axes_0 = const()[name = string("cos_141_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_141 = expand_dims(axes = cos_141_axes_0, x = var_2806_0)[name = string("cos_141")];
+            tensor<int32, [1]> sin_141_axes_0 = const()[name = string("sin_141_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_141 = expand_dims(axes = sin_141_axes_0, x = var_2810_0)[name = string("sin_141")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2815_cast_fp16 = mul(x = var_2802_cast_fp16_0, y = cos_141)[name = string("op_2815_cast_fp16")];
+            tensor<int32, [4]> x1_69_begin_0 = const()[name = string("x1_69_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_69_end_0 = const()[name = string("x1_69_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_69_end_mask_0 = const()[name = string("x1_69_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_69_cast_fp16 = slice_by_index(begin = x1_69_begin_0, end = x1_69_end_0, end_mask = x1_69_end_mask_0, x = var_2802_cast_fp16_0)[name = string("x1_69_cast_fp16")];
+            tensor<int32, [4]> x2_69_begin_0 = const()[name = string("x2_69_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_69_end_0 = const()[name = string("x2_69_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_69_end_mask_0 = const()[name = string("x2_69_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_69_cast_fp16 = slice_by_index(begin = x2_69_begin_0, end = x2_69_end_0, end_mask = x2_69_end_mask_0, x = var_2802_cast_fp16_0)[name = string("x2_69_cast_fp16")];
+            fp16 const_215_promoted_to_fp16 = const()[name = string("const_215_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2826_cast_fp16 = mul(x = x2_69_cast_fp16, y = const_215_promoted_to_fp16)[name = string("op_2826_cast_fp16")];
+            bool var_2828_interleave_0 = const()[name = string("op_2828_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2828_cast_fp16 = concat(axis = var_38, interleave = var_2828_interleave_0, values = (var_2826_cast_fp16, x1_69_cast_fp16))[name = string("op_2828_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2829_cast_fp16 = mul(x = var_2828_cast_fp16, y = sin_141)[name = string("op_2829_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2830_cast_fp16 = add(x = var_2815_cast_fp16, y = var_2829_cast_fp16)[name = string("op_2830_cast_fp16")];
+            tensor<int32, [1]> cos_145_axes_0 = const()[name = string("cos_145_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_145 = expand_dims(axes = cos_145_axes_0, x = var_2806_1)[name = string("cos_145")];
+            tensor<int32, [1]> sin_145_axes_0 = const()[name = string("sin_145_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_145 = expand_dims(axes = sin_145_axes_0, x = var_2810_1)[name = string("sin_145")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2833_cast_fp16 = mul(x = var_2802_cast_fp16_1, y = cos_145)[name = string("op_2833_cast_fp16")];
+            tensor<int32, [4]> x1_71_begin_0 = const()[name = string("x1_71_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_71_end_0 = const()[name = string("x1_71_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_71_end_mask_0 = const()[name = string("x1_71_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_71_cast_fp16 = slice_by_index(begin = x1_71_begin_0, end = x1_71_end_0, end_mask = x1_71_end_mask_0, x = var_2802_cast_fp16_1)[name = string("x1_71_cast_fp16")];
+            tensor<int32, [4]> x2_71_begin_0 = const()[name = string("x2_71_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_71_end_0 = const()[name = string("x2_71_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_71_end_mask_0 = const()[name = string("x2_71_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_71_cast_fp16 = slice_by_index(begin = x2_71_begin_0, end = x2_71_end_0, end_mask = x2_71_end_mask_0, x = var_2802_cast_fp16_1)[name = string("x2_71_cast_fp16")];
+            fp16 const_218_promoted_to_fp16 = const()[name = string("const_218_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_2844_cast_fp16 = mul(x = x2_71_cast_fp16, y = const_218_promoted_to_fp16)[name = string("op_2844_cast_fp16")];
+            bool var_2846_interleave_0 = const()[name = string("op_2846_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_2846_cast_fp16 = concat(axis = var_38, interleave = var_2846_interleave_0, values = (var_2844_cast_fp16, x1_71_cast_fp16))[name = string("op_2846_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2847_cast_fp16 = mul(x = var_2846_cast_fp16, y = sin_145)[name = string("op_2847_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_2848_cast_fp16 = add(x = var_2833_cast_fp16, y = var_2847_cast_fp16)[name = string("op_2848_cast_fp16")];
+            bool key_states_17_interleave_0 = const()[name = string("key_states_17_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_17_cast_fp16 = concat(axis = var_38, interleave = key_states_17_interleave_0, values = (var_2830_cast_fp16, var_2848_cast_fp16))[name = string("key_states_17_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_8_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(189598400)))];
+            tensor<fp16, [1, 2304, 768]> linear_59_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_8_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_112_cast_fp16)[name = string("linear_59_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.16p+4)];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.14p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_117_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_59_cast_fp16)[name = string("clip_117_cast_fp16")];
+            tensor<int32, [4]> var_2861 = const()[name = string("op_2861"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_487_cast_fp16 = reshape(shape = var_2861, x = clip_117_cast_fp16)[name = string("hidden_states_487_cast_fp16")];
+            fp16 var_33_promoted_59_to_fp16 = const()[name = string("op_33_promoted_59_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2864_cast_fp16 = pow(x = hidden_states_487_cast_fp16, y = var_33_promoted_59_to_fp16)[name = string("op_2864_cast_fp16")];
+            tensor<int32, [1]> var_2866_axes_0 = const()[name = string("op_2866_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2866_keep_dims_0 = const()[name = string("op_2866_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2866_cast_fp16 = reduce_mean(axes = var_2866_axes_0, keep_dims = var_2866_keep_dims_0, x = var_2864_cast_fp16)[name = string("op_2866_cast_fp16")];
+            fp16 var_2867_to_fp16 = const()[name = string("op_2867_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_119_cast_fp16 = add(x = var_2866_cast_fp16, y = var_2867_to_fp16)[name = string("mean_squared_119_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_2869_cast_fp16 = pow(x = mean_squared_119_cast_fp16, y = var_27_to_fp16)[name = string("op_2869_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_221_cast_fp16 = mul(x = hidden_states_487_cast_fp16, y = var_2869_cast_fp16)[name = string("normed_output_221_cast_fp16")];
+            tensor<int32, [4]> hidden_states_493_perm_0 = const()[name = string("hidden_states_493_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_8_transpose_y_0 = const()[name = string("matmul_8_transpose_y_0"), val = bool(true)];
+            bool matmul_8_transpose_x_0 = const()[name = string("matmul_8_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_80_perm_0 = const()[name = string("transpose_80_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_81_perm_0 = const()[name = string("transpose_81_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_81 = transpose(perm = transpose_81_perm_0, x = key_states_17_cast_fp16)[name = string("transpose_125")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_80 = transpose(perm = transpose_80_perm_0, x = query_states_17_cast_fp16)[name = string("transpose_126")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_8_cast_fp16 = matmul(transpose_x = matmul_8_transpose_x_0, transpose_y = matmul_8_transpose_y_0, x = transpose_80, y = transpose_81)[name = string("matmul_8_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_8_cast_fp16 = add(x = matmul_8_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_8_cast_fp16")];
+            int32 softmax_8_axis_0 = const()[name = string("softmax_8_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_8_cast_fp16 = softmax(axis = softmax_8_axis_0, x = add_8_cast_fp16)[name = string("softmax_8_cast_fp16")];
+            bool attn_output_33_transpose_x_0 = const()[name = string("attn_output_33_transpose_x_0"), val = bool(false)];
+            bool attn_output_33_transpose_y_0 = const()[name = string("attn_output_33_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_493_cast_fp16 = transpose(perm = hidden_states_493_perm_0, x = normed_output_221_cast_fp16)[name = string("transpose_127")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_33_cast_fp16 = matmul(transpose_x = attn_output_33_transpose_x_0, transpose_y = attn_output_33_transpose_y_0, x = softmax_8_cast_fp16, y = hidden_states_493_cast_fp16)[name = string("attn_output_33_cast_fp16")];
+            tensor<int32, [4]> var_2874_perm_0 = const()[name = string("op_2874_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_2876 = const()[name = string("op_2876"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_2874_cast_fp16 = transpose(perm = var_2874_perm_0, x = attn_output_33_cast_fp16)[name = string("transpose_124")];
+            tensor<fp16, [1, 2304, 768]> var_2877_cast_fp16 = reshape(shape = var_2876, x = var_2874_cast_fp16)[name = string("op_2877_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.2cp+1)];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.28p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_118_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_self_attn_o_proj_input_max_promoted_to_fp16, x = var_2877_cast_fp16)[name = string("clip_118_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_8_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(190778112)))];
+            tensor<fp16, [1, 2304, 768]> linear_60_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_8_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_118_cast_fp16)[name = string("linear_60_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.c2p+1)];
+            fp16 model_vision_tower_encoder_layers_8_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.bep+1)];
+            tensor<fp16, [1, 2304, 768]> clip_119_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_60_cast_fp16)[name = string("clip_119_cast_fp16")];
+            fp16 var_33_promoted_60_to_fp16 = const()[name = string("op_33_promoted_60_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2890_cast_fp16 = pow(x = clip_119_cast_fp16, y = var_33_promoted_60_to_fp16)[name = string("op_2890_cast_fp16")];
+            tensor<int32, [1]> var_2892_axes_0 = const()[name = string("op_2892_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2892_keep_dims_0 = const()[name = string("op_2892_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2892_cast_fp16 = reduce_mean(axes = var_2892_axes_0, keep_dims = var_2892_keep_dims_0, x = var_2890_cast_fp16)[name = string("op_2892_cast_fp16")];
+            fp16 var_2893_to_fp16 = const()[name = string("op_2893_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_121_cast_fp16 = add(x = var_2892_cast_fp16, y = var_2893_to_fp16)[name = string("mean_squared_121_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2895_cast_fp16 = pow(x = mean_squared_121_cast_fp16, y = var_27_to_fp16)[name = string("op_2895_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_223_cast_fp16 = mul(x = clip_119_cast_fp16, y = var_2895_cast_fp16)[name = string("normed_output_223_cast_fp16")];
+            tensor<fp16, [768]> const_219_to_fp16 = const()[name = string("const_219_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191957824)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_225_cast_fp16 = mul(x = normed_output_223_cast_fp16, y = const_219_to_fp16)[name = string("normed_output_225_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_505_cast_fp16 = add(x = hidden_states_467_cast_fp16, y = normed_output_225_cast_fp16)[name = string("hidden_states_505_cast_fp16")];
+            fp16 var_33_promoted_61_to_fp16 = const()[name = string("op_33_promoted_61_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2903_cast_fp16 = pow(x = hidden_states_505_cast_fp16, y = var_33_promoted_61_to_fp16)[name = string("op_2903_cast_fp16")];
+            tensor<int32, [1]> var_2905_axes_0 = const()[name = string("op_2905_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2905_keep_dims_0 = const()[name = string("op_2905_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2905_cast_fp16 = reduce_mean(axes = var_2905_axes_0, keep_dims = var_2905_keep_dims_0, x = var_2903_cast_fp16)[name = string("op_2905_cast_fp16")];
+            fp16 var_2906_to_fp16 = const()[name = string("op_2906_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_123_cast_fp16 = add(x = var_2905_cast_fp16, y = var_2906_to_fp16)[name = string("mean_squared_123_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2908_cast_fp16 = pow(x = mean_squared_123_cast_fp16, y = var_27_to_fp16)[name = string("op_2908_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_227_cast_fp16 = mul(x = hidden_states_505_cast_fp16, y = var_2908_cast_fp16)[name = string("normed_output_227_cast_fp16")];
+            tensor<fp16, [768]> const_220_to_fp16 = const()[name = string("const_220_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191959424)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_229_cast_fp16 = mul(x = normed_output_227_cast_fp16, y = const_220_to_fp16)[name = string("normed_output_229_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.cp+2)];
+            fp16 model_vision_tower_encoder_layers_8_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.bcp+2)];
+            tensor<fp16, [1, 2304, 768]> clip_120_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_229_cast_fp16)[name = string("clip_120_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_8_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(191961024)))];
+            tensor<fp16, [1, 2304, 3072]> linear_61_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_8_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_120_cast_fp16)[name = string("linear_61_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.d8p+2)];
+            fp16 model_vision_tower_encoder_layers_8_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.d4p+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_121_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_61_cast_fp16)[name = string("clip_121_cast_fp16")];
+            string var_2925_mode_0 = const()[name = string("op_2925_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_2925_cast_fp16 = gelu(mode = var_2925_mode_0, x = clip_121_cast_fp16)[name = string("op_2925_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_8_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(196679680)))];
+            tensor<fp16, [1, 2304, 3072]> linear_62_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_8_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_120_cast_fp16)[name = string("linear_62_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.d8p+2)];
+            fp16 model_vision_tower_encoder_layers_8_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.d4p+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_123_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_mlp_up_proj_output_max_promoted_to_fp16, x = linear_62_cast_fp16)[name = string("clip_123_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_515_cast_fp16 = mul(x = var_2925_cast_fp16, y = clip_123_cast_fp16)[name = string("hidden_states_515_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.44p+4)];
+            fp16 model_vision_tower_encoder_layers_8_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.42p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_124_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_515_cast_fp16)[name = string("clip_124_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_8_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(201398336)))];
+            tensor<fp16, [1, 2304, 768]> linear_63_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_8_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_124_cast_fp16)[name = string("linear_63_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_8_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.8cp+2)];
+            fp16 model_vision_tower_encoder_layers_8_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_8_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.8ap+2)];
+            tensor<fp16, [1, 2304, 768]> clip_125_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_8_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_8_mlp_down_proj_output_max_promoted_to_fp16, x = linear_63_cast_fp16)[name = string("clip_125_cast_fp16")];
+            fp16 var_33_promoted_62_to_fp16 = const()[name = string("op_33_promoted_62_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2947_cast_fp16 = pow(x = clip_125_cast_fp16, y = var_33_promoted_62_to_fp16)[name = string("op_2947_cast_fp16")];
+            tensor<int32, [1]> var_2949_axes_0 = const()[name = string("op_2949_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2949_keep_dims_0 = const()[name = string("op_2949_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2949_cast_fp16 = reduce_mean(axes = var_2949_axes_0, keep_dims = var_2949_keep_dims_0, x = var_2947_cast_fp16)[name = string("op_2949_cast_fp16")];
+            fp16 var_2950_to_fp16 = const()[name = string("op_2950_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_125_cast_fp16 = add(x = var_2949_cast_fp16, y = var_2950_to_fp16)[name = string("mean_squared_125_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2952_cast_fp16 = pow(x = mean_squared_125_cast_fp16, y = var_27_to_fp16)[name = string("op_2952_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_231_cast_fp16 = mul(x = clip_125_cast_fp16, y = var_2952_cast_fp16)[name = string("normed_output_231_cast_fp16")];
+            tensor<fp16, [768]> const_221_to_fp16 = const()[name = string("const_221_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(206116992)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_233_cast_fp16 = mul(x = normed_output_231_cast_fp16, y = const_221_to_fp16)[name = string("normed_output_233_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_525_cast_fp16 = add(x = hidden_states_505_cast_fp16, y = normed_output_233_cast_fp16)[name = string("hidden_states_525_cast_fp16")];
+            fp16 var_33_promoted_63_to_fp16 = const()[name = string("op_33_promoted_63_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_2966_cast_fp16 = pow(x = hidden_states_525_cast_fp16, y = var_33_promoted_63_to_fp16)[name = string("op_2966_cast_fp16")];
+            tensor<int32, [1]> var_2968_axes_0 = const()[name = string("op_2968_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2968_keep_dims_0 = const()[name = string("op_2968_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_2968_cast_fp16 = reduce_mean(axes = var_2968_axes_0, keep_dims = var_2968_keep_dims_0, x = var_2966_cast_fp16)[name = string("op_2968_cast_fp16")];
+            fp16 var_2969_to_fp16 = const()[name = string("op_2969_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_127_cast_fp16 = add(x = var_2968_cast_fp16, y = var_2969_to_fp16)[name = string("mean_squared_127_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_2971_cast_fp16 = pow(x = mean_squared_127_cast_fp16, y = var_27_to_fp16)[name = string("op_2971_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_235_cast_fp16 = mul(x = hidden_states_525_cast_fp16, y = var_2971_cast_fp16)[name = string("normed_output_235_cast_fp16")];
+            tensor<fp16, [768]> const_222_to_fp16 = const()[name = string("const_222_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(206118592)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_237_cast_fp16 = mul(x = normed_output_235_cast_fp16, y = const_222_to_fp16)[name = string("normed_output_237_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.7p+3)];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.6cp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_126_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_237_cast_fp16)[name = string("clip_126_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_9_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(206120192)))];
+            tensor<fp16, [1, 2304, 768]> linear_64_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_9_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_126_cast_fp16)[name = string("linear_64_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.dp+3)];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.ccp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_127_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_64_cast_fp16)[name = string("clip_127_cast_fp16")];
+            tensor<int32, [4]> var_2993 = const()[name = string("op_2993"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_533_cast_fp16 = reshape(shape = var_2993, x = clip_127_cast_fp16)[name = string("hidden_states_533_cast_fp16")];
+            fp16 var_33_promoted_64_to_fp16 = const()[name = string("op_33_promoted_64_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_2997_cast_fp16 = pow(x = hidden_states_533_cast_fp16, y = var_33_promoted_64_to_fp16)[name = string("op_2997_cast_fp16")];
+            tensor<int32, [1]> var_2999_axes_0 = const()[name = string("op_2999_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_2999_keep_dims_0 = const()[name = string("op_2999_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_2999_cast_fp16 = reduce_mean(axes = var_2999_axes_0, keep_dims = var_2999_keep_dims_0, x = var_2997_cast_fp16)[name = string("op_2999_cast_fp16")];
+            fp16 var_3000_to_fp16 = const()[name = string("op_3000_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_129_cast_fp16 = add(x = var_2999_cast_fp16, y = var_3000_to_fp16)[name = string("mean_squared_129_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3002_cast_fp16 = pow(x = mean_squared_129_cast_fp16, y = var_27_to_fp16)[name = string("op_3002_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_239_cast_fp16 = mul(x = hidden_states_533_cast_fp16, y = var_3002_cast_fp16)[name = string("normed_output_239_cast_fp16")];
+            tensor<fp16, [64]> const_225_to_fp16 = const()[name = string("const_225_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(207299904)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_241_cast_fp16 = mul(x = normed_output_239_cast_fp16, y = const_225_to_fp16)[name = string("normed_output_241_cast_fp16")];
+            tensor<int32, [2]> var_3022 = const()[name = string("op_3022"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3023_axis_0 = const()[name = string("op_3023_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3023_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3023_cast_fp16_1 = split(axis = var_3023_axis_0, split_sizes = var_3022, x = normed_output_241_cast_fp16)[name = string("op_3023_cast_fp16")];
+            tensor<int32, [2]> var_3026 = const()[name = string("op_3026"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3027_axis_0 = const()[name = string("op_3027_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3027_0, tensor<fp16, [1, 2304, 32]> var_3027_1 = split(axis = var_3027_axis_0, split_sizes = var_3026, x = var_160_cast_fp16)[name = string("op_3027")];
+            tensor<int32, [2]> var_3030 = const()[name = string("op_3030"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3031_axis_0 = const()[name = string("op_3031_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3031_0, tensor<fp16, [1, 2304, 32]> var_3031_1 = split(axis = var_3031_axis_0, split_sizes = var_3030, x = var_163_cast_fp16)[name = string("op_3031")];
+            tensor<int32, [1]> cos_149_axes_0 = const()[name = string("cos_149_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_149 = expand_dims(axes = cos_149_axes_0, x = var_3027_0)[name = string("cos_149")];
+            tensor<int32, [1]> sin_149_axes_0 = const()[name = string("sin_149_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_149 = expand_dims(axes = sin_149_axes_0, x = var_3031_0)[name = string("sin_149")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3036_cast_fp16 = mul(x = var_3023_cast_fp16_0, y = cos_149)[name = string("op_3036_cast_fp16")];
+            tensor<int32, [4]> x1_73_begin_0 = const()[name = string("x1_73_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_73_end_0 = const()[name = string("x1_73_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_73_end_mask_0 = const()[name = string("x1_73_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_73_cast_fp16 = slice_by_index(begin = x1_73_begin_0, end = x1_73_end_0, end_mask = x1_73_end_mask_0, x = var_3023_cast_fp16_0)[name = string("x1_73_cast_fp16")];
+            tensor<int32, [4]> x2_73_begin_0 = const()[name = string("x2_73_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_73_end_0 = const()[name = string("x2_73_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_73_end_mask_0 = const()[name = string("x2_73_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_73_cast_fp16 = slice_by_index(begin = x2_73_begin_0, end = x2_73_end_0, end_mask = x2_73_end_mask_0, x = var_3023_cast_fp16_0)[name = string("x2_73_cast_fp16")];
+            fp16 const_230_promoted_to_fp16 = const()[name = string("const_230_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3047_cast_fp16 = mul(x = x2_73_cast_fp16, y = const_230_promoted_to_fp16)[name = string("op_3047_cast_fp16")];
+            bool var_3049_interleave_0 = const()[name = string("op_3049_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3049_cast_fp16 = concat(axis = var_38, interleave = var_3049_interleave_0, values = (var_3047_cast_fp16, x1_73_cast_fp16))[name = string("op_3049_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3050_cast_fp16 = mul(x = var_3049_cast_fp16, y = sin_149)[name = string("op_3050_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3051_cast_fp16 = add(x = var_3036_cast_fp16, y = var_3050_cast_fp16)[name = string("op_3051_cast_fp16")];
+            tensor<int32, [1]> cos_153_axes_0 = const()[name = string("cos_153_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_153 = expand_dims(axes = cos_153_axes_0, x = var_3027_1)[name = string("cos_153")];
+            tensor<int32, [1]> sin_153_axes_0 = const()[name = string("sin_153_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_153 = expand_dims(axes = sin_153_axes_0, x = var_3031_1)[name = string("sin_153")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3054_cast_fp16 = mul(x = var_3023_cast_fp16_1, y = cos_153)[name = string("op_3054_cast_fp16")];
+            tensor<int32, [4]> x1_75_begin_0 = const()[name = string("x1_75_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_75_end_0 = const()[name = string("x1_75_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_75_end_mask_0 = const()[name = string("x1_75_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_75_cast_fp16 = slice_by_index(begin = x1_75_begin_0, end = x1_75_end_0, end_mask = x1_75_end_mask_0, x = var_3023_cast_fp16_1)[name = string("x1_75_cast_fp16")];
+            tensor<int32, [4]> x2_75_begin_0 = const()[name = string("x2_75_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_75_end_0 = const()[name = string("x2_75_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_75_end_mask_0 = const()[name = string("x2_75_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_75_cast_fp16 = slice_by_index(begin = x2_75_begin_0, end = x2_75_end_0, end_mask = x2_75_end_mask_0, x = var_3023_cast_fp16_1)[name = string("x2_75_cast_fp16")];
+            fp16 const_233_promoted_to_fp16 = const()[name = string("const_233_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3065_cast_fp16 = mul(x = x2_75_cast_fp16, y = const_233_promoted_to_fp16)[name = string("op_3065_cast_fp16")];
+            bool var_3067_interleave_0 = const()[name = string("op_3067_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3067_cast_fp16 = concat(axis = var_38, interleave = var_3067_interleave_0, values = (var_3065_cast_fp16, x1_75_cast_fp16))[name = string("op_3067_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3068_cast_fp16 = mul(x = var_3067_cast_fp16, y = sin_153)[name = string("op_3068_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3069_cast_fp16 = add(x = var_3054_cast_fp16, y = var_3068_cast_fp16)[name = string("op_3069_cast_fp16")];
+            bool query_states_19_interleave_0 = const()[name = string("query_states_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_19_cast_fp16 = concat(axis = var_38, interleave = query_states_19_interleave_0, values = (var_3051_cast_fp16, var_3069_cast_fp16))[name = string("query_states_19_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_9_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(207300096)))];
+            tensor<fp16, [1, 2304, 768]> linear_65_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_9_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_126_cast_fp16)[name = string("linear_65_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.fep+3)];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.fap+3)];
+            tensor<fp16, [1, 2304, 768]> clip_129_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_65_cast_fp16)[name = string("clip_129_cast_fp16")];
+            tensor<int32, [4]> var_3082 = const()[name = string("op_3082"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_539_cast_fp16 = reshape(shape = var_3082, x = clip_129_cast_fp16)[name = string("hidden_states_539_cast_fp16")];
+            fp16 var_33_promoted_65_to_fp16 = const()[name = string("op_33_promoted_65_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3086_cast_fp16 = pow(x = hidden_states_539_cast_fp16, y = var_33_promoted_65_to_fp16)[name = string("op_3086_cast_fp16")];
+            tensor<int32, [1]> var_3088_axes_0 = const()[name = string("op_3088_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3088_keep_dims_0 = const()[name = string("op_3088_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3088_cast_fp16 = reduce_mean(axes = var_3088_axes_0, keep_dims = var_3088_keep_dims_0, x = var_3086_cast_fp16)[name = string("op_3088_cast_fp16")];
+            fp16 var_3089_to_fp16 = const()[name = string("op_3089_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_131_cast_fp16 = add(x = var_3088_cast_fp16, y = var_3089_to_fp16)[name = string("mean_squared_131_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3091_cast_fp16 = pow(x = mean_squared_131_cast_fp16, y = var_27_to_fp16)[name = string("op_3091_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_243_cast_fp16 = mul(x = hidden_states_539_cast_fp16, y = var_3091_cast_fp16)[name = string("normed_output_243_cast_fp16")];
+            tensor<fp16, [64]> const_234_to_fp16 = const()[name = string("const_234_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(208479808)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_245_cast_fp16 = mul(x = normed_output_243_cast_fp16, y = const_234_to_fp16)[name = string("normed_output_245_cast_fp16")];
+            tensor<int32, [2]> var_3111 = const()[name = string("op_3111"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3112_axis_0 = const()[name = string("op_3112_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3112_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3112_cast_fp16_1 = split(axis = var_3112_axis_0, split_sizes = var_3111, x = normed_output_245_cast_fp16)[name = string("op_3112_cast_fp16")];
+            tensor<int32, [2]> var_3115 = const()[name = string("op_3115"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3116_axis_0 = const()[name = string("op_3116_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3116_0, tensor<fp16, [1, 2304, 32]> var_3116_1 = split(axis = var_3116_axis_0, split_sizes = var_3115, x = var_160_cast_fp16)[name = string("op_3116")];
+            tensor<int32, [2]> var_3119 = const()[name = string("op_3119"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3120_axis_0 = const()[name = string("op_3120_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3120_0, tensor<fp16, [1, 2304, 32]> var_3120_1 = split(axis = var_3120_axis_0, split_sizes = var_3119, x = var_163_cast_fp16)[name = string("op_3120")];
+            tensor<int32, [1]> cos_157_axes_0 = const()[name = string("cos_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_157 = expand_dims(axes = cos_157_axes_0, x = var_3116_0)[name = string("cos_157")];
+            tensor<int32, [1]> sin_157_axes_0 = const()[name = string("sin_157_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_157 = expand_dims(axes = sin_157_axes_0, x = var_3120_0)[name = string("sin_157")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3125_cast_fp16 = mul(x = var_3112_cast_fp16_0, y = cos_157)[name = string("op_3125_cast_fp16")];
+            tensor<int32, [4]> x1_77_begin_0 = const()[name = string("x1_77_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_77_end_0 = const()[name = string("x1_77_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_77_end_mask_0 = const()[name = string("x1_77_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_77_cast_fp16 = slice_by_index(begin = x1_77_begin_0, end = x1_77_end_0, end_mask = x1_77_end_mask_0, x = var_3112_cast_fp16_0)[name = string("x1_77_cast_fp16")];
+            tensor<int32, [4]> x2_77_begin_0 = const()[name = string("x2_77_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_77_end_0 = const()[name = string("x2_77_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_77_end_mask_0 = const()[name = string("x2_77_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_77_cast_fp16 = slice_by_index(begin = x2_77_begin_0, end = x2_77_end_0, end_mask = x2_77_end_mask_0, x = var_3112_cast_fp16_0)[name = string("x2_77_cast_fp16")];
+            fp16 const_239_promoted_to_fp16 = const()[name = string("const_239_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3136_cast_fp16 = mul(x = x2_77_cast_fp16, y = const_239_promoted_to_fp16)[name = string("op_3136_cast_fp16")];
+            bool var_3138_interleave_0 = const()[name = string("op_3138_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3138_cast_fp16 = concat(axis = var_38, interleave = var_3138_interleave_0, values = (var_3136_cast_fp16, x1_77_cast_fp16))[name = string("op_3138_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3139_cast_fp16 = mul(x = var_3138_cast_fp16, y = sin_157)[name = string("op_3139_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3140_cast_fp16 = add(x = var_3125_cast_fp16, y = var_3139_cast_fp16)[name = string("op_3140_cast_fp16")];
+            tensor<int32, [1]> cos_161_axes_0 = const()[name = string("cos_161_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_161 = expand_dims(axes = cos_161_axes_0, x = var_3116_1)[name = string("cos_161")];
+            tensor<int32, [1]> sin_161_axes_0 = const()[name = string("sin_161_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_161 = expand_dims(axes = sin_161_axes_0, x = var_3120_1)[name = string("sin_161")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3143_cast_fp16 = mul(x = var_3112_cast_fp16_1, y = cos_161)[name = string("op_3143_cast_fp16")];
+            tensor<int32, [4]> x1_79_begin_0 = const()[name = string("x1_79_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_79_end_0 = const()[name = string("x1_79_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_79_end_mask_0 = const()[name = string("x1_79_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_79_cast_fp16 = slice_by_index(begin = x1_79_begin_0, end = x1_79_end_0, end_mask = x1_79_end_mask_0, x = var_3112_cast_fp16_1)[name = string("x1_79_cast_fp16")];
+            tensor<int32, [4]> x2_79_begin_0 = const()[name = string("x2_79_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_79_end_0 = const()[name = string("x2_79_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_79_end_mask_0 = const()[name = string("x2_79_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_79_cast_fp16 = slice_by_index(begin = x2_79_begin_0, end = x2_79_end_0, end_mask = x2_79_end_mask_0, x = var_3112_cast_fp16_1)[name = string("x2_79_cast_fp16")];
+            fp16 const_242_promoted_to_fp16 = const()[name = string("const_242_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3154_cast_fp16 = mul(x = x2_79_cast_fp16, y = const_242_promoted_to_fp16)[name = string("op_3154_cast_fp16")];
+            bool var_3156_interleave_0 = const()[name = string("op_3156_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3156_cast_fp16 = concat(axis = var_38, interleave = var_3156_interleave_0, values = (var_3154_cast_fp16, x1_79_cast_fp16))[name = string("op_3156_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3157_cast_fp16 = mul(x = var_3156_cast_fp16, y = sin_161)[name = string("op_3157_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3158_cast_fp16 = add(x = var_3143_cast_fp16, y = var_3157_cast_fp16)[name = string("op_3158_cast_fp16")];
+            bool key_states_19_interleave_0 = const()[name = string("key_states_19_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_19_cast_fp16 = concat(axis = var_38, interleave = key_states_19_interleave_0, values = (var_3140_cast_fp16, var_3158_cast_fp16))[name = string("key_states_19_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_9_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(208480000)))];
+            tensor<fp16, [1, 2304, 768]> linear_66_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_9_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_126_cast_fp16)[name = string("linear_66_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.fep+3)];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.fap+3)];
+            tensor<fp16, [1, 2304, 768]> clip_131_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_66_cast_fp16)[name = string("clip_131_cast_fp16")];
+            tensor<int32, [4]> var_3171 = const()[name = string("op_3171"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_545_cast_fp16 = reshape(shape = var_3171, x = clip_131_cast_fp16)[name = string("hidden_states_545_cast_fp16")];
+            fp16 var_33_promoted_66_to_fp16 = const()[name = string("op_33_promoted_66_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3174_cast_fp16 = pow(x = hidden_states_545_cast_fp16, y = var_33_promoted_66_to_fp16)[name = string("op_3174_cast_fp16")];
+            tensor<int32, [1]> var_3176_axes_0 = const()[name = string("op_3176_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3176_keep_dims_0 = const()[name = string("op_3176_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3176_cast_fp16 = reduce_mean(axes = var_3176_axes_0, keep_dims = var_3176_keep_dims_0, x = var_3174_cast_fp16)[name = string("op_3176_cast_fp16")];
+            fp16 var_3177_to_fp16 = const()[name = string("op_3177_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_133_cast_fp16 = add(x = var_3176_cast_fp16, y = var_3177_to_fp16)[name = string("mean_squared_133_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3179_cast_fp16 = pow(x = mean_squared_133_cast_fp16, y = var_27_to_fp16)[name = string("op_3179_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_247_cast_fp16 = mul(x = hidden_states_545_cast_fp16, y = var_3179_cast_fp16)[name = string("normed_output_247_cast_fp16")];
+            tensor<int32, [4]> hidden_states_551_perm_0 = const()[name = string("hidden_states_551_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_9_transpose_y_0 = const()[name = string("matmul_9_transpose_y_0"), val = bool(true)];
+            bool matmul_9_transpose_x_0 = const()[name = string("matmul_9_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_82_perm_0 = const()[name = string("transpose_82_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_83_perm_0 = const()[name = string("transpose_83_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_83 = transpose(perm = transpose_83_perm_0, x = key_states_19_cast_fp16)[name = string("transpose_121")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_82 = transpose(perm = transpose_82_perm_0, x = query_states_19_cast_fp16)[name = string("transpose_122")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_9_cast_fp16 = matmul(transpose_x = matmul_9_transpose_x_0, transpose_y = matmul_9_transpose_y_0, x = transpose_82, y = transpose_83)[name = string("matmul_9_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_9_cast_fp16 = add(x = matmul_9_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_9_cast_fp16")];
+            int32 softmax_9_axis_0 = const()[name = string("softmax_9_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_9_cast_fp16 = softmax(axis = softmax_9_axis_0, x = add_9_cast_fp16)[name = string("softmax_9_cast_fp16")];
+            bool attn_output_37_transpose_x_0 = const()[name = string("attn_output_37_transpose_x_0"), val = bool(false)];
+            bool attn_output_37_transpose_y_0 = const()[name = string("attn_output_37_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_551_cast_fp16 = transpose(perm = hidden_states_551_perm_0, x = normed_output_247_cast_fp16)[name = string("transpose_123")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_37_cast_fp16 = matmul(transpose_x = attn_output_37_transpose_x_0, transpose_y = attn_output_37_transpose_y_0, x = softmax_9_cast_fp16, y = hidden_states_551_cast_fp16)[name = string("attn_output_37_cast_fp16")];
+            tensor<int32, [4]> var_3184_perm_0 = const()[name = string("op_3184_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3186 = const()[name = string("op_3186"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_3184_cast_fp16 = transpose(perm = var_3184_perm_0, x = attn_output_37_cast_fp16)[name = string("transpose_120")];
+            tensor<fp16, [1, 2304, 768]> var_3187_cast_fp16 = reshape(shape = var_3186, x = var_3184_cast_fp16)[name = string("op_3187_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.eep+0)];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.eap+0)];
+            tensor<fp16, [1, 2304, 768]> clip_132_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_self_attn_o_proj_input_max_promoted_to_fp16, x = var_3187_cast_fp16)[name = string("clip_132_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_9_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(209659712)))];
+            tensor<fp16, [1, 2304, 768]> linear_67_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_9_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_132_cast_fp16)[name = string("linear_67_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3ep+1)];
+            fp16 model_vision_tower_encoder_layers_9_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3cp+1)];
+            tensor<fp16, [1, 2304, 768]> clip_133_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_67_cast_fp16)[name = string("clip_133_cast_fp16")];
+            fp16 var_33_promoted_67_to_fp16 = const()[name = string("op_33_promoted_67_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3200_cast_fp16 = pow(x = clip_133_cast_fp16, y = var_33_promoted_67_to_fp16)[name = string("op_3200_cast_fp16")];
+            tensor<int32, [1]> var_3202_axes_0 = const()[name = string("op_3202_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3202_keep_dims_0 = const()[name = string("op_3202_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3202_cast_fp16 = reduce_mean(axes = var_3202_axes_0, keep_dims = var_3202_keep_dims_0, x = var_3200_cast_fp16)[name = string("op_3202_cast_fp16")];
+            fp16 var_3203_to_fp16 = const()[name = string("op_3203_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_135_cast_fp16 = add(x = var_3202_cast_fp16, y = var_3203_to_fp16)[name = string("mean_squared_135_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3205_cast_fp16 = pow(x = mean_squared_135_cast_fp16, y = var_27_to_fp16)[name = string("op_3205_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_249_cast_fp16 = mul(x = clip_133_cast_fp16, y = var_3205_cast_fp16)[name = string("normed_output_249_cast_fp16")];
+            tensor<fp16, [768]> const_243_to_fp16 = const()[name = string("const_243_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210839424)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_251_cast_fp16 = mul(x = normed_output_249_cast_fp16, y = const_243_to_fp16)[name = string("normed_output_251_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_563_cast_fp16 = add(x = hidden_states_525_cast_fp16, y = normed_output_251_cast_fp16)[name = string("hidden_states_563_cast_fp16")];
+            fp16 var_33_promoted_68_to_fp16 = const()[name = string("op_33_promoted_68_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3213_cast_fp16 = pow(x = hidden_states_563_cast_fp16, y = var_33_promoted_68_to_fp16)[name = string("op_3213_cast_fp16")];
+            tensor<int32, [1]> var_3215_axes_0 = const()[name = string("op_3215_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3215_keep_dims_0 = const()[name = string("op_3215_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3215_cast_fp16 = reduce_mean(axes = var_3215_axes_0, keep_dims = var_3215_keep_dims_0, x = var_3213_cast_fp16)[name = string("op_3215_cast_fp16")];
+            fp16 var_3216_to_fp16 = const()[name = string("op_3216_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_137_cast_fp16 = add(x = var_3215_cast_fp16, y = var_3216_to_fp16)[name = string("mean_squared_137_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3218_cast_fp16 = pow(x = mean_squared_137_cast_fp16, y = var_27_to_fp16)[name = string("op_3218_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_253_cast_fp16 = mul(x = hidden_states_563_cast_fp16, y = var_3218_cast_fp16)[name = string("normed_output_253_cast_fp16")];
+            tensor<fp16, [768]> const_244_to_fp16 = const()[name = string("const_244_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210841024)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_255_cast_fp16 = mul(x = normed_output_253_cast_fp16, y = const_244_to_fp16)[name = string("normed_output_255_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.16p+3)];
+            fp16 model_vision_tower_encoder_layers_9_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.14p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_134_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_255_cast_fp16)[name = string("clip_134_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_9_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(210842624)))];
+            tensor<fp16, [1, 2304, 3072]> linear_68_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_9_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_134_cast_fp16)[name = string("linear_68_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3p+3)];
+            fp16 model_vision_tower_encoder_layers_9_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.2ep+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_135_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_68_cast_fp16)[name = string("clip_135_cast_fp16")];
+            string var_3235_mode_0 = const()[name = string("op_3235_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_3235_cast_fp16 = gelu(mode = var_3235_mode_0, x = clip_135_cast_fp16)[name = string("op_3235_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_9_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(215561280)))];
+            tensor<fp16, [1, 2304, 3072]> linear_69_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_9_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_134_cast_fp16)[name = string("linear_69_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3p+3)];
+            fp16 model_vision_tower_encoder_layers_9_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.2ep+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_137_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_mlp_up_proj_output_max_promoted_to_fp16, x = linear_69_cast_fp16)[name = string("clip_137_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_573_cast_fp16 = mul(x = var_3235_cast_fp16, y = clip_137_cast_fp16)[name = string("hidden_states_573_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.cap+4)];
+            fp16 model_vision_tower_encoder_layers_9_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.c6p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_138_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_573_cast_fp16)[name = string("clip_138_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_9_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(220279936)))];
+            tensor<fp16, [1, 2304, 768]> linear_70_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_9_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_138_cast_fp16)[name = string("linear_70_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_9_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.4ep+3)];
+            fp16 model_vision_tower_encoder_layers_9_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_9_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.4ap+3)];
+            tensor<fp16, [1, 2304, 768]> clip_139_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_9_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_9_mlp_down_proj_output_max_promoted_to_fp16, x = linear_70_cast_fp16)[name = string("clip_139_cast_fp16")];
+            fp16 var_33_promoted_69_to_fp16 = const()[name = string("op_33_promoted_69_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3257_cast_fp16 = pow(x = clip_139_cast_fp16, y = var_33_promoted_69_to_fp16)[name = string("op_3257_cast_fp16")];
+            tensor<int32, [1]> var_3259_axes_0 = const()[name = string("op_3259_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3259_keep_dims_0 = const()[name = string("op_3259_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3259_cast_fp16 = reduce_mean(axes = var_3259_axes_0, keep_dims = var_3259_keep_dims_0, x = var_3257_cast_fp16)[name = string("op_3259_cast_fp16")];
+            fp16 var_3260_to_fp16 = const()[name = string("op_3260_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_139_cast_fp16 = add(x = var_3259_cast_fp16, y = var_3260_to_fp16)[name = string("mean_squared_139_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3262_cast_fp16 = pow(x = mean_squared_139_cast_fp16, y = var_27_to_fp16)[name = string("op_3262_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_257_cast_fp16 = mul(x = clip_139_cast_fp16, y = var_3262_cast_fp16)[name = string("normed_output_257_cast_fp16")];
+            tensor<fp16, [768]> const_245_to_fp16 = const()[name = string("const_245_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(224998592)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_259_cast_fp16 = mul(x = normed_output_257_cast_fp16, y = const_245_to_fp16)[name = string("normed_output_259_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_583_cast_fp16 = add(x = hidden_states_563_cast_fp16, y = normed_output_259_cast_fp16)[name = string("hidden_states_583_cast_fp16")];
+            fp16 var_33_promoted_70_to_fp16 = const()[name = string("op_33_promoted_70_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3276_cast_fp16 = pow(x = hidden_states_583_cast_fp16, y = var_33_promoted_70_to_fp16)[name = string("op_3276_cast_fp16")];
+            tensor<int32, [1]> var_3278_axes_0 = const()[name = string("op_3278_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3278_keep_dims_0 = const()[name = string("op_3278_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3278_cast_fp16 = reduce_mean(axes = var_3278_axes_0, keep_dims = var_3278_keep_dims_0, x = var_3276_cast_fp16)[name = string("op_3278_cast_fp16")];
+            fp16 var_3279_to_fp16 = const()[name = string("op_3279_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_141_cast_fp16 = add(x = var_3278_cast_fp16, y = var_3279_to_fp16)[name = string("mean_squared_141_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3281_cast_fp16 = pow(x = mean_squared_141_cast_fp16, y = var_27_to_fp16)[name = string("op_3281_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_261_cast_fp16 = mul(x = hidden_states_583_cast_fp16, y = var_3281_cast_fp16)[name = string("normed_output_261_cast_fp16")];
+            tensor<fp16, [768]> const_246_to_fp16 = const()[name = string("const_246_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225000192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_263_cast_fp16 = mul(x = normed_output_261_cast_fp16, y = const_246_to_fp16)[name = string("normed_output_263_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.b6p+3)];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.b4p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_140_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_263_cast_fp16)[name = string("clip_140_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_10_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(225001792)))];
+            tensor<fp16, [1, 2304, 768]> linear_71_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_10_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_140_cast_fp16)[name = string("linear_71_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.eap+3)];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.e6p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_141_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_71_cast_fp16)[name = string("clip_141_cast_fp16")];
+            tensor<int32, [4]> var_3303 = const()[name = string("op_3303"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_591_cast_fp16 = reshape(shape = var_3303, x = clip_141_cast_fp16)[name = string("hidden_states_591_cast_fp16")];
+            fp16 var_33_promoted_71_to_fp16 = const()[name = string("op_33_promoted_71_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3307_cast_fp16 = pow(x = hidden_states_591_cast_fp16, y = var_33_promoted_71_to_fp16)[name = string("op_3307_cast_fp16")];
+            tensor<int32, [1]> var_3309_axes_0 = const()[name = string("op_3309_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3309_keep_dims_0 = const()[name = string("op_3309_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3309_cast_fp16 = reduce_mean(axes = var_3309_axes_0, keep_dims = var_3309_keep_dims_0, x = var_3307_cast_fp16)[name = string("op_3309_cast_fp16")];
+            fp16 var_3310_to_fp16 = const()[name = string("op_3310_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_143_cast_fp16 = add(x = var_3309_cast_fp16, y = var_3310_to_fp16)[name = string("mean_squared_143_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3312_cast_fp16 = pow(x = mean_squared_143_cast_fp16, y = var_27_to_fp16)[name = string("op_3312_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_265_cast_fp16 = mul(x = hidden_states_591_cast_fp16, y = var_3312_cast_fp16)[name = string("normed_output_265_cast_fp16")];
+            tensor<fp16, [64]> const_249_to_fp16 = const()[name = string("const_249_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226181504)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_267_cast_fp16 = mul(x = normed_output_265_cast_fp16, y = const_249_to_fp16)[name = string("normed_output_267_cast_fp16")];
+            tensor<int32, [2]> var_3332 = const()[name = string("op_3332"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3333_axis_0 = const()[name = string("op_3333_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3333_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3333_cast_fp16_1 = split(axis = var_3333_axis_0, split_sizes = var_3332, x = normed_output_267_cast_fp16)[name = string("op_3333_cast_fp16")];
+            tensor<int32, [2]> var_3336 = const()[name = string("op_3336"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3337_axis_0 = const()[name = string("op_3337_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3337_0, tensor<fp16, [1, 2304, 32]> var_3337_1 = split(axis = var_3337_axis_0, split_sizes = var_3336, x = var_160_cast_fp16)[name = string("op_3337")];
+            tensor<int32, [2]> var_3340 = const()[name = string("op_3340"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3341_axis_0 = const()[name = string("op_3341_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3341_0, tensor<fp16, [1, 2304, 32]> var_3341_1 = split(axis = var_3341_axis_0, split_sizes = var_3340, x = var_163_cast_fp16)[name = string("op_3341")];
+            tensor<int32, [1]> cos_165_axes_0 = const()[name = string("cos_165_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_165 = expand_dims(axes = cos_165_axes_0, x = var_3337_0)[name = string("cos_165")];
+            tensor<int32, [1]> sin_165_axes_0 = const()[name = string("sin_165_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_165 = expand_dims(axes = sin_165_axes_0, x = var_3341_0)[name = string("sin_165")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3346_cast_fp16 = mul(x = var_3333_cast_fp16_0, y = cos_165)[name = string("op_3346_cast_fp16")];
+            tensor<int32, [4]> x1_81_begin_0 = const()[name = string("x1_81_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_81_end_0 = const()[name = string("x1_81_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_81_end_mask_0 = const()[name = string("x1_81_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_81_cast_fp16 = slice_by_index(begin = x1_81_begin_0, end = x1_81_end_0, end_mask = x1_81_end_mask_0, x = var_3333_cast_fp16_0)[name = string("x1_81_cast_fp16")];
+            tensor<int32, [4]> x2_81_begin_0 = const()[name = string("x2_81_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_81_end_0 = const()[name = string("x2_81_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_81_end_mask_0 = const()[name = string("x2_81_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_81_cast_fp16 = slice_by_index(begin = x2_81_begin_0, end = x2_81_end_0, end_mask = x2_81_end_mask_0, x = var_3333_cast_fp16_0)[name = string("x2_81_cast_fp16")];
+            fp16 const_254_promoted_to_fp16 = const()[name = string("const_254_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3357_cast_fp16 = mul(x = x2_81_cast_fp16, y = const_254_promoted_to_fp16)[name = string("op_3357_cast_fp16")];
+            bool var_3359_interleave_0 = const()[name = string("op_3359_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3359_cast_fp16 = concat(axis = var_38, interleave = var_3359_interleave_0, values = (var_3357_cast_fp16, x1_81_cast_fp16))[name = string("op_3359_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3360_cast_fp16 = mul(x = var_3359_cast_fp16, y = sin_165)[name = string("op_3360_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3361_cast_fp16 = add(x = var_3346_cast_fp16, y = var_3360_cast_fp16)[name = string("op_3361_cast_fp16")];
+            tensor<int32, [1]> cos_169_axes_0 = const()[name = string("cos_169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_169 = expand_dims(axes = cos_169_axes_0, x = var_3337_1)[name = string("cos_169")];
+            tensor<int32, [1]> sin_169_axes_0 = const()[name = string("sin_169_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_169 = expand_dims(axes = sin_169_axes_0, x = var_3341_1)[name = string("sin_169")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3364_cast_fp16 = mul(x = var_3333_cast_fp16_1, y = cos_169)[name = string("op_3364_cast_fp16")];
+            tensor<int32, [4]> x1_83_begin_0 = const()[name = string("x1_83_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_83_end_0 = const()[name = string("x1_83_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_83_end_mask_0 = const()[name = string("x1_83_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_83_cast_fp16 = slice_by_index(begin = x1_83_begin_0, end = x1_83_end_0, end_mask = x1_83_end_mask_0, x = var_3333_cast_fp16_1)[name = string("x1_83_cast_fp16")];
+            tensor<int32, [4]> x2_83_begin_0 = const()[name = string("x2_83_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_83_end_0 = const()[name = string("x2_83_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_83_end_mask_0 = const()[name = string("x2_83_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_83_cast_fp16 = slice_by_index(begin = x2_83_begin_0, end = x2_83_end_0, end_mask = x2_83_end_mask_0, x = var_3333_cast_fp16_1)[name = string("x2_83_cast_fp16")];
+            fp16 const_257_promoted_to_fp16 = const()[name = string("const_257_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3375_cast_fp16 = mul(x = x2_83_cast_fp16, y = const_257_promoted_to_fp16)[name = string("op_3375_cast_fp16")];
+            bool var_3377_interleave_0 = const()[name = string("op_3377_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3377_cast_fp16 = concat(axis = var_38, interleave = var_3377_interleave_0, values = (var_3375_cast_fp16, x1_83_cast_fp16))[name = string("op_3377_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3378_cast_fp16 = mul(x = var_3377_cast_fp16, y = sin_169)[name = string("op_3378_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3379_cast_fp16 = add(x = var_3364_cast_fp16, y = var_3378_cast_fp16)[name = string("op_3379_cast_fp16")];
+            bool query_states_21_interleave_0 = const()[name = string("query_states_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_21_cast_fp16 = concat(axis = var_38, interleave = query_states_21_interleave_0, values = (var_3361_cast_fp16, var_3379_cast_fp16))[name = string("query_states_21_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_10_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(226181696)))];
+            tensor<fp16, [1, 2304, 768]> linear_72_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_10_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_140_cast_fp16)[name = string("linear_72_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.52p+4)];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_143_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_72_cast_fp16)[name = string("clip_143_cast_fp16")];
+            tensor<int32, [4]> var_3392 = const()[name = string("op_3392"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_597_cast_fp16 = reshape(shape = var_3392, x = clip_143_cast_fp16)[name = string("hidden_states_597_cast_fp16")];
+            fp16 var_33_promoted_72_to_fp16 = const()[name = string("op_33_promoted_72_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3396_cast_fp16 = pow(x = hidden_states_597_cast_fp16, y = var_33_promoted_72_to_fp16)[name = string("op_3396_cast_fp16")];
+            tensor<int32, [1]> var_3398_axes_0 = const()[name = string("op_3398_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3398_keep_dims_0 = const()[name = string("op_3398_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3398_cast_fp16 = reduce_mean(axes = var_3398_axes_0, keep_dims = var_3398_keep_dims_0, x = var_3396_cast_fp16)[name = string("op_3398_cast_fp16")];
+            fp16 var_3399_to_fp16 = const()[name = string("op_3399_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_145_cast_fp16 = add(x = var_3398_cast_fp16, y = var_3399_to_fp16)[name = string("mean_squared_145_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3401_cast_fp16 = pow(x = mean_squared_145_cast_fp16, y = var_27_to_fp16)[name = string("op_3401_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_269_cast_fp16 = mul(x = hidden_states_597_cast_fp16, y = var_3401_cast_fp16)[name = string("normed_output_269_cast_fp16")];
+            tensor<fp16, [64]> const_258_to_fp16 = const()[name = string("const_258_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227361408)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_271_cast_fp16 = mul(x = normed_output_269_cast_fp16, y = const_258_to_fp16)[name = string("normed_output_271_cast_fp16")];
+            tensor<int32, [2]> var_3421 = const()[name = string("op_3421"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3422_axis_0 = const()[name = string("op_3422_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3422_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3422_cast_fp16_1 = split(axis = var_3422_axis_0, split_sizes = var_3421, x = normed_output_271_cast_fp16)[name = string("op_3422_cast_fp16")];
+            tensor<int32, [2]> var_3425 = const()[name = string("op_3425"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3426_axis_0 = const()[name = string("op_3426_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3426_0, tensor<fp16, [1, 2304, 32]> var_3426_1 = split(axis = var_3426_axis_0, split_sizes = var_3425, x = var_160_cast_fp16)[name = string("op_3426")];
+            tensor<int32, [2]> var_3429 = const()[name = string("op_3429"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3430_axis_0 = const()[name = string("op_3430_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3430_0, tensor<fp16, [1, 2304, 32]> var_3430_1 = split(axis = var_3430_axis_0, split_sizes = var_3429, x = var_163_cast_fp16)[name = string("op_3430")];
+            tensor<int32, [1]> cos_173_axes_0 = const()[name = string("cos_173_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_173 = expand_dims(axes = cos_173_axes_0, x = var_3426_0)[name = string("cos_173")];
+            tensor<int32, [1]> sin_173_axes_0 = const()[name = string("sin_173_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_173 = expand_dims(axes = sin_173_axes_0, x = var_3430_0)[name = string("sin_173")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3435_cast_fp16 = mul(x = var_3422_cast_fp16_0, y = cos_173)[name = string("op_3435_cast_fp16")];
+            tensor<int32, [4]> x1_85_begin_0 = const()[name = string("x1_85_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_85_end_0 = const()[name = string("x1_85_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_85_end_mask_0 = const()[name = string("x1_85_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_85_cast_fp16 = slice_by_index(begin = x1_85_begin_0, end = x1_85_end_0, end_mask = x1_85_end_mask_0, x = var_3422_cast_fp16_0)[name = string("x1_85_cast_fp16")];
+            tensor<int32, [4]> x2_85_begin_0 = const()[name = string("x2_85_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_85_end_0 = const()[name = string("x2_85_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_85_end_mask_0 = const()[name = string("x2_85_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_85_cast_fp16 = slice_by_index(begin = x2_85_begin_0, end = x2_85_end_0, end_mask = x2_85_end_mask_0, x = var_3422_cast_fp16_0)[name = string("x2_85_cast_fp16")];
+            fp16 const_263_promoted_to_fp16 = const()[name = string("const_263_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3446_cast_fp16 = mul(x = x2_85_cast_fp16, y = const_263_promoted_to_fp16)[name = string("op_3446_cast_fp16")];
+            bool var_3448_interleave_0 = const()[name = string("op_3448_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3448_cast_fp16 = concat(axis = var_38, interleave = var_3448_interleave_0, values = (var_3446_cast_fp16, x1_85_cast_fp16))[name = string("op_3448_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3449_cast_fp16 = mul(x = var_3448_cast_fp16, y = sin_173)[name = string("op_3449_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3450_cast_fp16 = add(x = var_3435_cast_fp16, y = var_3449_cast_fp16)[name = string("op_3450_cast_fp16")];
+            tensor<int32, [1]> cos_177_axes_0 = const()[name = string("cos_177_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_177 = expand_dims(axes = cos_177_axes_0, x = var_3426_1)[name = string("cos_177")];
+            tensor<int32, [1]> sin_177_axes_0 = const()[name = string("sin_177_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_177 = expand_dims(axes = sin_177_axes_0, x = var_3430_1)[name = string("sin_177")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3453_cast_fp16 = mul(x = var_3422_cast_fp16_1, y = cos_177)[name = string("op_3453_cast_fp16")];
+            tensor<int32, [4]> x1_87_begin_0 = const()[name = string("x1_87_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_87_end_0 = const()[name = string("x1_87_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_87_end_mask_0 = const()[name = string("x1_87_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_87_cast_fp16 = slice_by_index(begin = x1_87_begin_0, end = x1_87_end_0, end_mask = x1_87_end_mask_0, x = var_3422_cast_fp16_1)[name = string("x1_87_cast_fp16")];
+            tensor<int32, [4]> x2_87_begin_0 = const()[name = string("x2_87_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_87_end_0 = const()[name = string("x2_87_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_87_end_mask_0 = const()[name = string("x2_87_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_87_cast_fp16 = slice_by_index(begin = x2_87_begin_0, end = x2_87_end_0, end_mask = x2_87_end_mask_0, x = var_3422_cast_fp16_1)[name = string("x2_87_cast_fp16")];
+            fp16 const_266_promoted_to_fp16 = const()[name = string("const_266_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3464_cast_fp16 = mul(x = x2_87_cast_fp16, y = const_266_promoted_to_fp16)[name = string("op_3464_cast_fp16")];
+            bool var_3466_interleave_0 = const()[name = string("op_3466_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3466_cast_fp16 = concat(axis = var_38, interleave = var_3466_interleave_0, values = (var_3464_cast_fp16, x1_87_cast_fp16))[name = string("op_3466_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3467_cast_fp16 = mul(x = var_3466_cast_fp16, y = sin_177)[name = string("op_3467_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3468_cast_fp16 = add(x = var_3453_cast_fp16, y = var_3467_cast_fp16)[name = string("op_3468_cast_fp16")];
+            bool key_states_21_interleave_0 = const()[name = string("key_states_21_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_21_cast_fp16 = concat(axis = var_38, interleave = key_states_21_interleave_0, values = (var_3450_cast_fp16, var_3468_cast_fp16))[name = string("key_states_21_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_10_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(227361600)))];
+            tensor<fp16, [1, 2304, 768]> linear_73_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_10_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_140_cast_fp16)[name = string("linear_73_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.52p+4)];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_145_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_73_cast_fp16)[name = string("clip_145_cast_fp16")];
+            tensor<int32, [4]> var_3481 = const()[name = string("op_3481"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_603_cast_fp16 = reshape(shape = var_3481, x = clip_145_cast_fp16)[name = string("hidden_states_603_cast_fp16")];
+            fp16 var_33_promoted_73_to_fp16 = const()[name = string("op_33_promoted_73_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3484_cast_fp16 = pow(x = hidden_states_603_cast_fp16, y = var_33_promoted_73_to_fp16)[name = string("op_3484_cast_fp16")];
+            tensor<int32, [1]> var_3486_axes_0 = const()[name = string("op_3486_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3486_keep_dims_0 = const()[name = string("op_3486_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3486_cast_fp16 = reduce_mean(axes = var_3486_axes_0, keep_dims = var_3486_keep_dims_0, x = var_3484_cast_fp16)[name = string("op_3486_cast_fp16")];
+            fp16 var_3487_to_fp16 = const()[name = string("op_3487_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_147_cast_fp16 = add(x = var_3486_cast_fp16, y = var_3487_to_fp16)[name = string("mean_squared_147_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3489_cast_fp16 = pow(x = mean_squared_147_cast_fp16, y = var_27_to_fp16)[name = string("op_3489_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_273_cast_fp16 = mul(x = hidden_states_603_cast_fp16, y = var_3489_cast_fp16)[name = string("normed_output_273_cast_fp16")];
+            tensor<int32, [4]> hidden_states_609_perm_0 = const()[name = string("hidden_states_609_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_10_transpose_y_0 = const()[name = string("matmul_10_transpose_y_0"), val = bool(true)];
+            bool matmul_10_transpose_x_0 = const()[name = string("matmul_10_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_84_perm_0 = const()[name = string("transpose_84_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_85_perm_0 = const()[name = string("transpose_85_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_85 = transpose(perm = transpose_85_perm_0, x = key_states_21_cast_fp16)[name = string("transpose_117")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_84 = transpose(perm = transpose_84_perm_0, x = query_states_21_cast_fp16)[name = string("transpose_118")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_10_cast_fp16 = matmul(transpose_x = matmul_10_transpose_x_0, transpose_y = matmul_10_transpose_y_0, x = transpose_84, y = transpose_85)[name = string("matmul_10_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_10_cast_fp16 = add(x = matmul_10_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_10_cast_fp16")];
+            int32 softmax_10_axis_0 = const()[name = string("softmax_10_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_10_cast_fp16 = softmax(axis = softmax_10_axis_0, x = add_10_cast_fp16)[name = string("softmax_10_cast_fp16")];
+            bool attn_output_41_transpose_x_0 = const()[name = string("attn_output_41_transpose_x_0"), val = bool(false)];
+            bool attn_output_41_transpose_y_0 = const()[name = string("attn_output_41_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_609_cast_fp16 = transpose(perm = hidden_states_609_perm_0, x = normed_output_273_cast_fp16)[name = string("transpose_119")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_41_cast_fp16 = matmul(transpose_x = attn_output_41_transpose_x_0, transpose_y = attn_output_41_transpose_y_0, x = softmax_10_cast_fp16, y = hidden_states_609_cast_fp16)[name = string("attn_output_41_cast_fp16")];
+            tensor<int32, [4]> var_3494_perm_0 = const()[name = string("op_3494_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3496 = const()[name = string("op_3496"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_3494_cast_fp16 = transpose(perm = var_3494_perm_0, x = attn_output_41_cast_fp16)[name = string("transpose_116")];
+            tensor<fp16, [1, 2304, 768]> var_3497_cast_fp16 = reshape(shape = var_3496, x = var_3494_cast_fp16)[name = string("op_3497_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.f2p+0)];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.fp+0)];
+            tensor<fp16, [1, 2304, 768]> clip_146_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_self_attn_o_proj_input_max_promoted_to_fp16, x = var_3497_cast_fp16)[name = string("clip_146_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_10_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(228541312)))];
+            tensor<fp16, [1, 2304, 768]> linear_74_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_10_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_146_cast_fp16)[name = string("linear_74_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3p+1)];
+            fp16 model_vision_tower_encoder_layers_10_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.2ep+1)];
+            tensor<fp16, [1, 2304, 768]> clip_147_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_74_cast_fp16)[name = string("clip_147_cast_fp16")];
+            fp16 var_33_promoted_74_to_fp16 = const()[name = string("op_33_promoted_74_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3510_cast_fp16 = pow(x = clip_147_cast_fp16, y = var_33_promoted_74_to_fp16)[name = string("op_3510_cast_fp16")];
+            tensor<int32, [1]> var_3512_axes_0 = const()[name = string("op_3512_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3512_keep_dims_0 = const()[name = string("op_3512_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3512_cast_fp16 = reduce_mean(axes = var_3512_axes_0, keep_dims = var_3512_keep_dims_0, x = var_3510_cast_fp16)[name = string("op_3512_cast_fp16")];
+            fp16 var_3513_to_fp16 = const()[name = string("op_3513_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_149_cast_fp16 = add(x = var_3512_cast_fp16, y = var_3513_to_fp16)[name = string("mean_squared_149_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3515_cast_fp16 = pow(x = mean_squared_149_cast_fp16, y = var_27_to_fp16)[name = string("op_3515_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_275_cast_fp16 = mul(x = clip_147_cast_fp16, y = var_3515_cast_fp16)[name = string("normed_output_275_cast_fp16")];
+            tensor<fp16, [768]> const_267_to_fp16 = const()[name = string("const_267_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229721024)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_277_cast_fp16 = mul(x = normed_output_275_cast_fp16, y = const_267_to_fp16)[name = string("normed_output_277_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_621_cast_fp16 = add(x = hidden_states_583_cast_fp16, y = normed_output_277_cast_fp16)[name = string("hidden_states_621_cast_fp16")];
+            fp16 var_33_promoted_75_to_fp16 = const()[name = string("op_33_promoted_75_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3523_cast_fp16 = pow(x = hidden_states_621_cast_fp16, y = var_33_promoted_75_to_fp16)[name = string("op_3523_cast_fp16")];
+            tensor<int32, [1]> var_3525_axes_0 = const()[name = string("op_3525_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3525_keep_dims_0 = const()[name = string("op_3525_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3525_cast_fp16 = reduce_mean(axes = var_3525_axes_0, keep_dims = var_3525_keep_dims_0, x = var_3523_cast_fp16)[name = string("op_3525_cast_fp16")];
+            fp16 var_3526_to_fp16 = const()[name = string("op_3526_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_151_cast_fp16 = add(x = var_3525_cast_fp16, y = var_3526_to_fp16)[name = string("mean_squared_151_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3528_cast_fp16 = pow(x = mean_squared_151_cast_fp16, y = var_27_to_fp16)[name = string("op_3528_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_279_cast_fp16 = mul(x = hidden_states_621_cast_fp16, y = var_3528_cast_fp16)[name = string("normed_output_279_cast_fp16")];
+            tensor<fp16, [768]> const_268_to_fp16 = const()[name = string("const_268_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229722624)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_281_cast_fp16 = mul(x = normed_output_279_cast_fp16, y = const_268_to_fp16)[name = string("normed_output_281_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.1ep+3)];
+            fp16 model_vision_tower_encoder_layers_10_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.1cp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_148_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_281_cast_fp16)[name = string("clip_148_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_10_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(229724224)))];
+            tensor<fp16, [1, 2304, 3072]> linear_75_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_10_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_148_cast_fp16)[name = string("linear_75_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.38p+3)];
+            fp16 model_vision_tower_encoder_layers_10_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.36p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_149_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_75_cast_fp16)[name = string("clip_149_cast_fp16")];
+            string var_3545_mode_0 = const()[name = string("op_3545_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_3545_cast_fp16 = gelu(mode = var_3545_mode_0, x = clip_149_cast_fp16)[name = string("op_3545_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_10_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(234442880)))];
+            tensor<fp16, [1, 2304, 3072]> linear_76_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_10_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_148_cast_fp16)[name = string("linear_76_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.38p+3)];
+            fp16 model_vision_tower_encoder_layers_10_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.36p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_151_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_mlp_up_proj_output_max_promoted_to_fp16, x = linear_76_cast_fp16)[name = string("clip_151_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_631_cast_fp16 = mul(x = var_3545_cast_fp16, y = clip_151_cast_fp16)[name = string("hidden_states_631_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.28p+4)];
+            fp16 model_vision_tower_encoder_layers_10_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.26p+4)];
+            tensor<fp16, [1, 2304, 3072]> clip_152_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_631_cast_fp16)[name = string("clip_152_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_10_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(239161536)))];
+            tensor<fp16, [1, 2304, 768]> linear_77_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_10_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_152_cast_fp16)[name = string("linear_77_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_10_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.66p+2)];
+            fp16 model_vision_tower_encoder_layers_10_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_10_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.64p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_153_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_10_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_10_mlp_down_proj_output_max_promoted_to_fp16, x = linear_77_cast_fp16)[name = string("clip_153_cast_fp16")];
+            fp16 var_33_promoted_76_to_fp16 = const()[name = string("op_33_promoted_76_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3567_cast_fp16 = pow(x = clip_153_cast_fp16, y = var_33_promoted_76_to_fp16)[name = string("op_3567_cast_fp16")];
+            tensor<int32, [1]> var_3569_axes_0 = const()[name = string("op_3569_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3569_keep_dims_0 = const()[name = string("op_3569_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3569_cast_fp16 = reduce_mean(axes = var_3569_axes_0, keep_dims = var_3569_keep_dims_0, x = var_3567_cast_fp16)[name = string("op_3569_cast_fp16")];
+            fp16 var_3570_to_fp16 = const()[name = string("op_3570_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_153_cast_fp16 = add(x = var_3569_cast_fp16, y = var_3570_to_fp16)[name = string("mean_squared_153_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3572_cast_fp16 = pow(x = mean_squared_153_cast_fp16, y = var_27_to_fp16)[name = string("op_3572_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_283_cast_fp16 = mul(x = clip_153_cast_fp16, y = var_3572_cast_fp16)[name = string("normed_output_283_cast_fp16")];
+            tensor<fp16, [768]> const_269_to_fp16 = const()[name = string("const_269_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(243880192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_285_cast_fp16 = mul(x = normed_output_283_cast_fp16, y = const_269_to_fp16)[name = string("normed_output_285_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_641_cast_fp16 = add(x = hidden_states_621_cast_fp16, y = normed_output_285_cast_fp16)[name = string("hidden_states_641_cast_fp16")];
+            fp16 var_33_promoted_77_to_fp16 = const()[name = string("op_33_promoted_77_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3586_cast_fp16 = pow(x = hidden_states_641_cast_fp16, y = var_33_promoted_77_to_fp16)[name = string("op_3586_cast_fp16")];
+            tensor<int32, [1]> var_3588_axes_0 = const()[name = string("op_3588_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3588_keep_dims_0 = const()[name = string("op_3588_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3588_cast_fp16 = reduce_mean(axes = var_3588_axes_0, keep_dims = var_3588_keep_dims_0, x = var_3586_cast_fp16)[name = string("op_3588_cast_fp16")];
+            fp16 var_3589_to_fp16 = const()[name = string("op_3589_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_155_cast_fp16 = add(x = var_3588_cast_fp16, y = var_3589_to_fp16)[name = string("mean_squared_155_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3591_cast_fp16 = pow(x = mean_squared_155_cast_fp16, y = var_27_to_fp16)[name = string("op_3591_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_287_cast_fp16 = mul(x = hidden_states_641_cast_fp16, y = var_3591_cast_fp16)[name = string("normed_output_287_cast_fp16")];
+            tensor<fp16, [768]> const_270_to_fp16 = const()[name = string("const_270_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(243881792)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_289_cast_fp16 = mul(x = normed_output_287_cast_fp16, y = const_270_to_fp16)[name = string("normed_output_289_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.7cp+3)];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.78p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_154_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_289_cast_fp16)[name = string("clip_154_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_11_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(243883392)))];
+            tensor<fp16, [1, 2304, 768]> linear_78_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_11_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_154_cast_fp16)[name = string("linear_78_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.dp+3)];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.ccp+3)];
+            tensor<fp16, [1, 2304, 768]> clip_155_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_78_cast_fp16)[name = string("clip_155_cast_fp16")];
+            tensor<int32, [4]> var_3613 = const()[name = string("op_3613"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_649_cast_fp16 = reshape(shape = var_3613, x = clip_155_cast_fp16)[name = string("hidden_states_649_cast_fp16")];
+            fp16 var_33_promoted_78_to_fp16 = const()[name = string("op_33_promoted_78_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3617_cast_fp16 = pow(x = hidden_states_649_cast_fp16, y = var_33_promoted_78_to_fp16)[name = string("op_3617_cast_fp16")];
+            tensor<int32, [1]> var_3619_axes_0 = const()[name = string("op_3619_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3619_keep_dims_0 = const()[name = string("op_3619_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3619_cast_fp16 = reduce_mean(axes = var_3619_axes_0, keep_dims = var_3619_keep_dims_0, x = var_3617_cast_fp16)[name = string("op_3619_cast_fp16")];
+            fp16 var_3620_to_fp16 = const()[name = string("op_3620_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_157_cast_fp16 = add(x = var_3619_cast_fp16, y = var_3620_to_fp16)[name = string("mean_squared_157_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3622_cast_fp16 = pow(x = mean_squared_157_cast_fp16, y = var_27_to_fp16)[name = string("op_3622_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_291_cast_fp16 = mul(x = hidden_states_649_cast_fp16, y = var_3622_cast_fp16)[name = string("normed_output_291_cast_fp16")];
+            tensor<fp16, [64]> const_273_to_fp16 = const()[name = string("const_273_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(245063104)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_293_cast_fp16 = mul(x = normed_output_291_cast_fp16, y = const_273_to_fp16)[name = string("normed_output_293_cast_fp16")];
+            tensor<int32, [2]> var_3642 = const()[name = string("op_3642"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3643_axis_0 = const()[name = string("op_3643_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3643_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3643_cast_fp16_1 = split(axis = var_3643_axis_0, split_sizes = var_3642, x = normed_output_293_cast_fp16)[name = string("op_3643_cast_fp16")];
+            tensor<int32, [2]> var_3646 = const()[name = string("op_3646"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3647_axis_0 = const()[name = string("op_3647_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3647_0, tensor<fp16, [1, 2304, 32]> var_3647_1 = split(axis = var_3647_axis_0, split_sizes = var_3646, x = var_160_cast_fp16)[name = string("op_3647")];
+            tensor<int32, [2]> var_3650 = const()[name = string("op_3650"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3651_axis_0 = const()[name = string("op_3651_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3651_0, tensor<fp16, [1, 2304, 32]> var_3651_1 = split(axis = var_3651_axis_0, split_sizes = var_3650, x = var_163_cast_fp16)[name = string("op_3651")];
+            tensor<int32, [1]> cos_181_axes_0 = const()[name = string("cos_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_181 = expand_dims(axes = cos_181_axes_0, x = var_3647_0)[name = string("cos_181")];
+            tensor<int32, [1]> sin_181_axes_0 = const()[name = string("sin_181_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_181 = expand_dims(axes = sin_181_axes_0, x = var_3651_0)[name = string("sin_181")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3656_cast_fp16 = mul(x = var_3643_cast_fp16_0, y = cos_181)[name = string("op_3656_cast_fp16")];
+            tensor<int32, [4]> x1_89_begin_0 = const()[name = string("x1_89_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_89_end_0 = const()[name = string("x1_89_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_89_end_mask_0 = const()[name = string("x1_89_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_89_cast_fp16 = slice_by_index(begin = x1_89_begin_0, end = x1_89_end_0, end_mask = x1_89_end_mask_0, x = var_3643_cast_fp16_0)[name = string("x1_89_cast_fp16")];
+            tensor<int32, [4]> x2_89_begin_0 = const()[name = string("x2_89_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_89_end_0 = const()[name = string("x2_89_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_89_end_mask_0 = const()[name = string("x2_89_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_89_cast_fp16 = slice_by_index(begin = x2_89_begin_0, end = x2_89_end_0, end_mask = x2_89_end_mask_0, x = var_3643_cast_fp16_0)[name = string("x2_89_cast_fp16")];
+            fp16 const_278_promoted_to_fp16 = const()[name = string("const_278_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3667_cast_fp16 = mul(x = x2_89_cast_fp16, y = const_278_promoted_to_fp16)[name = string("op_3667_cast_fp16")];
+            bool var_3669_interleave_0 = const()[name = string("op_3669_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3669_cast_fp16 = concat(axis = var_38, interleave = var_3669_interleave_0, values = (var_3667_cast_fp16, x1_89_cast_fp16))[name = string("op_3669_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3670_cast_fp16 = mul(x = var_3669_cast_fp16, y = sin_181)[name = string("op_3670_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3671_cast_fp16 = add(x = var_3656_cast_fp16, y = var_3670_cast_fp16)[name = string("op_3671_cast_fp16")];
+            tensor<int32, [1]> cos_185_axes_0 = const()[name = string("cos_185_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_185 = expand_dims(axes = cos_185_axes_0, x = var_3647_1)[name = string("cos_185")];
+            tensor<int32, [1]> sin_185_axes_0 = const()[name = string("sin_185_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_185 = expand_dims(axes = sin_185_axes_0, x = var_3651_1)[name = string("sin_185")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3674_cast_fp16 = mul(x = var_3643_cast_fp16_1, y = cos_185)[name = string("op_3674_cast_fp16")];
+            tensor<int32, [4]> x1_91_begin_0 = const()[name = string("x1_91_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_91_end_0 = const()[name = string("x1_91_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_91_end_mask_0 = const()[name = string("x1_91_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_91_cast_fp16 = slice_by_index(begin = x1_91_begin_0, end = x1_91_end_0, end_mask = x1_91_end_mask_0, x = var_3643_cast_fp16_1)[name = string("x1_91_cast_fp16")];
+            tensor<int32, [4]> x2_91_begin_0 = const()[name = string("x2_91_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_91_end_0 = const()[name = string("x2_91_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_91_end_mask_0 = const()[name = string("x2_91_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_91_cast_fp16 = slice_by_index(begin = x2_91_begin_0, end = x2_91_end_0, end_mask = x2_91_end_mask_0, x = var_3643_cast_fp16_1)[name = string("x2_91_cast_fp16")];
+            fp16 const_281_promoted_to_fp16 = const()[name = string("const_281_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3685_cast_fp16 = mul(x = x2_91_cast_fp16, y = const_281_promoted_to_fp16)[name = string("op_3685_cast_fp16")];
+            bool var_3687_interleave_0 = const()[name = string("op_3687_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3687_cast_fp16 = concat(axis = var_38, interleave = var_3687_interleave_0, values = (var_3685_cast_fp16, x1_91_cast_fp16))[name = string("op_3687_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3688_cast_fp16 = mul(x = var_3687_cast_fp16, y = sin_185)[name = string("op_3688_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3689_cast_fp16 = add(x = var_3674_cast_fp16, y = var_3688_cast_fp16)[name = string("op_3689_cast_fp16")];
+            bool query_states_23_interleave_0 = const()[name = string("query_states_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_23_cast_fp16 = concat(axis = var_38, interleave = query_states_23_interleave_0, values = (var_3671_cast_fp16, var_3689_cast_fp16))[name = string("query_states_23_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_11_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(245063296)))];
+            tensor<fp16, [1, 2304, 768]> linear_79_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_11_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_154_cast_fp16)[name = string("linear_79_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.bap+3)];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.b8p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_157_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_79_cast_fp16)[name = string("clip_157_cast_fp16")];
+            tensor<int32, [4]> var_3702 = const()[name = string("op_3702"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_655_cast_fp16 = reshape(shape = var_3702, x = clip_157_cast_fp16)[name = string("hidden_states_655_cast_fp16")];
+            fp16 var_33_promoted_79_to_fp16 = const()[name = string("op_33_promoted_79_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3706_cast_fp16 = pow(x = hidden_states_655_cast_fp16, y = var_33_promoted_79_to_fp16)[name = string("op_3706_cast_fp16")];
+            tensor<int32, [1]> var_3708_axes_0 = const()[name = string("op_3708_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3708_keep_dims_0 = const()[name = string("op_3708_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3708_cast_fp16 = reduce_mean(axes = var_3708_axes_0, keep_dims = var_3708_keep_dims_0, x = var_3706_cast_fp16)[name = string("op_3708_cast_fp16")];
+            fp16 var_3709_to_fp16 = const()[name = string("op_3709_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_159_cast_fp16 = add(x = var_3708_cast_fp16, y = var_3709_to_fp16)[name = string("mean_squared_159_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3711_cast_fp16 = pow(x = mean_squared_159_cast_fp16, y = var_27_to_fp16)[name = string("op_3711_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_295_cast_fp16 = mul(x = hidden_states_655_cast_fp16, y = var_3711_cast_fp16)[name = string("normed_output_295_cast_fp16")];
+            tensor<fp16, [64]> const_282_to_fp16 = const()[name = string("const_282_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(246243008)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_297_cast_fp16 = mul(x = normed_output_295_cast_fp16, y = const_282_to_fp16)[name = string("normed_output_297_cast_fp16")];
+            tensor<int32, [2]> var_3731 = const()[name = string("op_3731"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3732_axis_0 = const()[name = string("op_3732_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3732_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3732_cast_fp16_1 = split(axis = var_3732_axis_0, split_sizes = var_3731, x = normed_output_297_cast_fp16)[name = string("op_3732_cast_fp16")];
+            tensor<int32, [2]> var_3735 = const()[name = string("op_3735"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3736_axis_0 = const()[name = string("op_3736_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3736_0, tensor<fp16, [1, 2304, 32]> var_3736_1 = split(axis = var_3736_axis_0, split_sizes = var_3735, x = var_160_cast_fp16)[name = string("op_3736")];
+            tensor<int32, [2]> var_3739 = const()[name = string("op_3739"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3740_axis_0 = const()[name = string("op_3740_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3740_0, tensor<fp16, [1, 2304, 32]> var_3740_1 = split(axis = var_3740_axis_0, split_sizes = var_3739, x = var_163_cast_fp16)[name = string("op_3740")];
+            tensor<int32, [1]> cos_189_axes_0 = const()[name = string("cos_189_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_189 = expand_dims(axes = cos_189_axes_0, x = var_3736_0)[name = string("cos_189")];
+            tensor<int32, [1]> sin_189_axes_0 = const()[name = string("sin_189_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_189 = expand_dims(axes = sin_189_axes_0, x = var_3740_0)[name = string("sin_189")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3745_cast_fp16 = mul(x = var_3732_cast_fp16_0, y = cos_189)[name = string("op_3745_cast_fp16")];
+            tensor<int32, [4]> x1_93_begin_0 = const()[name = string("x1_93_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_93_end_0 = const()[name = string("x1_93_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_93_end_mask_0 = const()[name = string("x1_93_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_93_cast_fp16 = slice_by_index(begin = x1_93_begin_0, end = x1_93_end_0, end_mask = x1_93_end_mask_0, x = var_3732_cast_fp16_0)[name = string("x1_93_cast_fp16")];
+            tensor<int32, [4]> x2_93_begin_0 = const()[name = string("x2_93_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_93_end_0 = const()[name = string("x2_93_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_93_end_mask_0 = const()[name = string("x2_93_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_93_cast_fp16 = slice_by_index(begin = x2_93_begin_0, end = x2_93_end_0, end_mask = x2_93_end_mask_0, x = var_3732_cast_fp16_0)[name = string("x2_93_cast_fp16")];
+            fp16 const_287_promoted_to_fp16 = const()[name = string("const_287_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3756_cast_fp16 = mul(x = x2_93_cast_fp16, y = const_287_promoted_to_fp16)[name = string("op_3756_cast_fp16")];
+            bool var_3758_interleave_0 = const()[name = string("op_3758_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3758_cast_fp16 = concat(axis = var_38, interleave = var_3758_interleave_0, values = (var_3756_cast_fp16, x1_93_cast_fp16))[name = string("op_3758_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3759_cast_fp16 = mul(x = var_3758_cast_fp16, y = sin_189)[name = string("op_3759_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3760_cast_fp16 = add(x = var_3745_cast_fp16, y = var_3759_cast_fp16)[name = string("op_3760_cast_fp16")];
+            tensor<int32, [1]> cos_193_axes_0 = const()[name = string("cos_193_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_193 = expand_dims(axes = cos_193_axes_0, x = var_3736_1)[name = string("cos_193")];
+            tensor<int32, [1]> sin_193_axes_0 = const()[name = string("sin_193_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_193 = expand_dims(axes = sin_193_axes_0, x = var_3740_1)[name = string("sin_193")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3763_cast_fp16 = mul(x = var_3732_cast_fp16_1, y = cos_193)[name = string("op_3763_cast_fp16")];
+            tensor<int32, [4]> x1_95_begin_0 = const()[name = string("x1_95_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_95_end_0 = const()[name = string("x1_95_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_95_end_mask_0 = const()[name = string("x1_95_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_95_cast_fp16 = slice_by_index(begin = x1_95_begin_0, end = x1_95_end_0, end_mask = x1_95_end_mask_0, x = var_3732_cast_fp16_1)[name = string("x1_95_cast_fp16")];
+            tensor<int32, [4]> x2_95_begin_0 = const()[name = string("x2_95_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_95_end_0 = const()[name = string("x2_95_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_95_end_mask_0 = const()[name = string("x2_95_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_95_cast_fp16 = slice_by_index(begin = x2_95_begin_0, end = x2_95_end_0, end_mask = x2_95_end_mask_0, x = var_3732_cast_fp16_1)[name = string("x2_95_cast_fp16")];
+            fp16 const_290_promoted_to_fp16 = const()[name = string("const_290_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3774_cast_fp16 = mul(x = x2_95_cast_fp16, y = const_290_promoted_to_fp16)[name = string("op_3774_cast_fp16")];
+            bool var_3776_interleave_0 = const()[name = string("op_3776_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3776_cast_fp16 = concat(axis = var_38, interleave = var_3776_interleave_0, values = (var_3774_cast_fp16, x1_95_cast_fp16))[name = string("op_3776_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3777_cast_fp16 = mul(x = var_3776_cast_fp16, y = sin_193)[name = string("op_3777_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3778_cast_fp16 = add(x = var_3763_cast_fp16, y = var_3777_cast_fp16)[name = string("op_3778_cast_fp16")];
+            bool key_states_23_interleave_0 = const()[name = string("key_states_23_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_23_cast_fp16 = concat(axis = var_38, interleave = key_states_23_interleave_0, values = (var_3760_cast_fp16, var_3778_cast_fp16))[name = string("key_states_23_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_11_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(246243200)))];
+            tensor<fp16, [1, 2304, 768]> linear_80_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_11_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_154_cast_fp16)[name = string("linear_80_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.bap+3)];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.b8p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_159_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_80_cast_fp16)[name = string("clip_159_cast_fp16")];
+            tensor<int32, [4]> var_3791 = const()[name = string("op_3791"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_661_cast_fp16 = reshape(shape = var_3791, x = clip_159_cast_fp16)[name = string("hidden_states_661_cast_fp16")];
+            fp16 var_33_promoted_80_to_fp16 = const()[name = string("op_33_promoted_80_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3794_cast_fp16 = pow(x = hidden_states_661_cast_fp16, y = var_33_promoted_80_to_fp16)[name = string("op_3794_cast_fp16")];
+            tensor<int32, [1]> var_3796_axes_0 = const()[name = string("op_3796_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3796_keep_dims_0 = const()[name = string("op_3796_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3796_cast_fp16 = reduce_mean(axes = var_3796_axes_0, keep_dims = var_3796_keep_dims_0, x = var_3794_cast_fp16)[name = string("op_3796_cast_fp16")];
+            fp16 var_3797_to_fp16 = const()[name = string("op_3797_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_161_cast_fp16 = add(x = var_3796_cast_fp16, y = var_3797_to_fp16)[name = string("mean_squared_161_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3799_cast_fp16 = pow(x = mean_squared_161_cast_fp16, y = var_27_to_fp16)[name = string("op_3799_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_299_cast_fp16 = mul(x = hidden_states_661_cast_fp16, y = var_3799_cast_fp16)[name = string("normed_output_299_cast_fp16")];
+            tensor<int32, [4]> hidden_states_667_perm_0 = const()[name = string("hidden_states_667_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_11_transpose_y_0 = const()[name = string("matmul_11_transpose_y_0"), val = bool(true)];
+            bool matmul_11_transpose_x_0 = const()[name = string("matmul_11_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_86_perm_0 = const()[name = string("transpose_86_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_87_perm_0 = const()[name = string("transpose_87_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_87 = transpose(perm = transpose_87_perm_0, x = key_states_23_cast_fp16)[name = string("transpose_113")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_86 = transpose(perm = transpose_86_perm_0, x = query_states_23_cast_fp16)[name = string("transpose_114")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_11_cast_fp16 = matmul(transpose_x = matmul_11_transpose_x_0, transpose_y = matmul_11_transpose_y_0, x = transpose_86, y = transpose_87)[name = string("matmul_11_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_11_cast_fp16 = add(x = matmul_11_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_11_cast_fp16")];
+            int32 softmax_11_axis_0 = const()[name = string("softmax_11_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_11_cast_fp16 = softmax(axis = softmax_11_axis_0, x = add_11_cast_fp16)[name = string("softmax_11_cast_fp16")];
+            bool attn_output_45_transpose_x_0 = const()[name = string("attn_output_45_transpose_x_0"), val = bool(false)];
+            bool attn_output_45_transpose_y_0 = const()[name = string("attn_output_45_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_667_cast_fp16 = transpose(perm = hidden_states_667_perm_0, x = normed_output_299_cast_fp16)[name = string("transpose_115")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_45_cast_fp16 = matmul(transpose_x = attn_output_45_transpose_x_0, transpose_y = attn_output_45_transpose_y_0, x = softmax_11_cast_fp16, y = hidden_states_667_cast_fp16)[name = string("attn_output_45_cast_fp16")];
+            tensor<int32, [4]> var_3804_perm_0 = const()[name = string("op_3804_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_3806 = const()[name = string("op_3806"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_3804_cast_fp16 = transpose(perm = var_3804_perm_0, x = attn_output_45_cast_fp16)[name = string("transpose_112")];
+            tensor<fp16, [1, 2304, 768]> var_3807_cast_fp16 = reshape(shape = var_3806, x = var_3804_cast_fp16)[name = string("op_3807_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.34p+1)];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.32p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_160_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_self_attn_o_proj_input_max_promoted_to_fp16, x = var_3807_cast_fp16)[name = string("clip_160_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_11_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(247422912)))];
+            tensor<fp16, [1, 2304, 768]> linear_81_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_11_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_160_cast_fp16)[name = string("linear_81_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.dep+1)];
+            fp16 model_vision_tower_encoder_layers_11_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.dap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_161_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_81_cast_fp16)[name = string("clip_161_cast_fp16")];
+            fp16 var_33_promoted_81_to_fp16 = const()[name = string("op_33_promoted_81_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3820_cast_fp16 = pow(x = clip_161_cast_fp16, y = var_33_promoted_81_to_fp16)[name = string("op_3820_cast_fp16")];
+            tensor<int32, [1]> var_3822_axes_0 = const()[name = string("op_3822_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3822_keep_dims_0 = const()[name = string("op_3822_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3822_cast_fp16 = reduce_mean(axes = var_3822_axes_0, keep_dims = var_3822_keep_dims_0, x = var_3820_cast_fp16)[name = string("op_3822_cast_fp16")];
+            fp16 var_3823_to_fp16 = const()[name = string("op_3823_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_163_cast_fp16 = add(x = var_3822_cast_fp16, y = var_3823_to_fp16)[name = string("mean_squared_163_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3825_cast_fp16 = pow(x = mean_squared_163_cast_fp16, y = var_27_to_fp16)[name = string("op_3825_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_301_cast_fp16 = mul(x = clip_161_cast_fp16, y = var_3825_cast_fp16)[name = string("normed_output_301_cast_fp16")];
+            tensor<fp16, [768]> const_291_to_fp16 = const()[name = string("const_291_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(248602624)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_303_cast_fp16 = mul(x = normed_output_301_cast_fp16, y = const_291_to_fp16)[name = string("normed_output_303_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_679_cast_fp16 = add(x = hidden_states_641_cast_fp16, y = normed_output_303_cast_fp16)[name = string("hidden_states_679_cast_fp16")];
+            fp16 var_33_promoted_82_to_fp16 = const()[name = string("op_33_promoted_82_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3833_cast_fp16 = pow(x = hidden_states_679_cast_fp16, y = var_33_promoted_82_to_fp16)[name = string("op_3833_cast_fp16")];
+            tensor<int32, [1]> var_3835_axes_0 = const()[name = string("op_3835_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3835_keep_dims_0 = const()[name = string("op_3835_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3835_cast_fp16 = reduce_mean(axes = var_3835_axes_0, keep_dims = var_3835_keep_dims_0, x = var_3833_cast_fp16)[name = string("op_3835_cast_fp16")];
+            fp16 var_3836_to_fp16 = const()[name = string("op_3836_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_165_cast_fp16 = add(x = var_3835_cast_fp16, y = var_3836_to_fp16)[name = string("mean_squared_165_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3838_cast_fp16 = pow(x = mean_squared_165_cast_fp16, y = var_27_to_fp16)[name = string("op_3838_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_305_cast_fp16 = mul(x = hidden_states_679_cast_fp16, y = var_3838_cast_fp16)[name = string("normed_output_305_cast_fp16")];
+            tensor<fp16, [768]> const_292_to_fp16 = const()[name = string("const_292_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(248604224)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_307_cast_fp16 = mul(x = normed_output_305_cast_fp16, y = const_292_to_fp16)[name = string("normed_output_307_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.84p+2)];
+            fp16 model_vision_tower_encoder_layers_11_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.82p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_162_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_307_cast_fp16)[name = string("clip_162_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_11_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(248605824)))];
+            tensor<fp16, [1, 2304, 3072]> linear_82_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_11_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_162_cast_fp16)[name = string("linear_82_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.8p+2)];
+            fp16 model_vision_tower_encoder_layers_11_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.7ep+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_163_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_82_cast_fp16)[name = string("clip_163_cast_fp16")];
+            string var_3855_mode_0 = const()[name = string("op_3855_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_3855_cast_fp16 = gelu(mode = var_3855_mode_0, x = clip_163_cast_fp16)[name = string("op_3855_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_11_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(253324480)))];
+            tensor<fp16, [1, 2304, 3072]> linear_83_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_11_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_162_cast_fp16)[name = string("linear_83_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.8p+2)];
+            fp16 model_vision_tower_encoder_layers_11_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.7ep+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_165_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_mlp_up_proj_output_max_promoted_to_fp16, x = linear_83_cast_fp16)[name = string("clip_165_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_689_cast_fp16 = mul(x = var_3855_cast_fp16, y = clip_165_cast_fp16)[name = string("hidden_states_689_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.2ep+3)];
+            fp16 model_vision_tower_encoder_layers_11_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.2cp+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_166_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_689_cast_fp16)[name = string("clip_166_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_11_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(258043136)))];
+            tensor<fp16, [1, 2304, 768]> linear_84_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_11_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_166_cast_fp16)[name = string("linear_84_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_11_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3cp+1)];
+            fp16 model_vision_tower_encoder_layers_11_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_11_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3ap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_167_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_11_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_11_mlp_down_proj_output_max_promoted_to_fp16, x = linear_84_cast_fp16)[name = string("clip_167_cast_fp16")];
+            fp16 var_33_promoted_83_to_fp16 = const()[name = string("op_33_promoted_83_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3877_cast_fp16 = pow(x = clip_167_cast_fp16, y = var_33_promoted_83_to_fp16)[name = string("op_3877_cast_fp16")];
+            tensor<int32, [1]> var_3879_axes_0 = const()[name = string("op_3879_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3879_keep_dims_0 = const()[name = string("op_3879_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3879_cast_fp16 = reduce_mean(axes = var_3879_axes_0, keep_dims = var_3879_keep_dims_0, x = var_3877_cast_fp16)[name = string("op_3879_cast_fp16")];
+            fp16 var_3880_to_fp16 = const()[name = string("op_3880_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_167_cast_fp16 = add(x = var_3879_cast_fp16, y = var_3880_to_fp16)[name = string("mean_squared_167_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3882_cast_fp16 = pow(x = mean_squared_167_cast_fp16, y = var_27_to_fp16)[name = string("op_3882_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_309_cast_fp16 = mul(x = clip_167_cast_fp16, y = var_3882_cast_fp16)[name = string("normed_output_309_cast_fp16")];
+            tensor<fp16, [768]> const_293_to_fp16 = const()[name = string("const_293_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(262761792)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_311_cast_fp16 = mul(x = normed_output_309_cast_fp16, y = const_293_to_fp16)[name = string("normed_output_311_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_699_cast_fp16 = add(x = hidden_states_679_cast_fp16, y = normed_output_311_cast_fp16)[name = string("hidden_states_699_cast_fp16")];
+            fp16 var_33_promoted_84_to_fp16 = const()[name = string("op_33_promoted_84_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_3896_cast_fp16 = pow(x = hidden_states_699_cast_fp16, y = var_33_promoted_84_to_fp16)[name = string("op_3896_cast_fp16")];
+            tensor<int32, [1]> var_3898_axes_0 = const()[name = string("op_3898_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3898_keep_dims_0 = const()[name = string("op_3898_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_3898_cast_fp16 = reduce_mean(axes = var_3898_axes_0, keep_dims = var_3898_keep_dims_0, x = var_3896_cast_fp16)[name = string("op_3898_cast_fp16")];
+            fp16 var_3899_to_fp16 = const()[name = string("op_3899_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_169_cast_fp16 = add(x = var_3898_cast_fp16, y = var_3899_to_fp16)[name = string("mean_squared_169_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_3901_cast_fp16 = pow(x = mean_squared_169_cast_fp16, y = var_27_to_fp16)[name = string("op_3901_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_313_cast_fp16 = mul(x = hidden_states_699_cast_fp16, y = var_3901_cast_fp16)[name = string("normed_output_313_cast_fp16")];
+            tensor<fp16, [768]> const_294_to_fp16 = const()[name = string("const_294_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(262763392)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_315_cast_fp16 = mul(x = normed_output_313_cast_fp16, y = const_294_to_fp16)[name = string("normed_output_315_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.cap+3)];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.c6p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_168_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_315_cast_fp16)[name = string("clip_168_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_12_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(262764992)))];
+            tensor<fp16, [1, 2304, 768]> linear_85_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_12_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_168_cast_fp16)[name = string("linear_85_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.f8p+3)];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.f4p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_169_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_85_cast_fp16)[name = string("clip_169_cast_fp16")];
+            tensor<int32, [4]> var_3923 = const()[name = string("op_3923"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_707_cast_fp16 = reshape(shape = var_3923, x = clip_169_cast_fp16)[name = string("hidden_states_707_cast_fp16")];
+            fp16 var_33_promoted_85_to_fp16 = const()[name = string("op_33_promoted_85_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_3927_cast_fp16 = pow(x = hidden_states_707_cast_fp16, y = var_33_promoted_85_to_fp16)[name = string("op_3927_cast_fp16")];
+            tensor<int32, [1]> var_3929_axes_0 = const()[name = string("op_3929_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_3929_keep_dims_0 = const()[name = string("op_3929_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_3929_cast_fp16 = reduce_mean(axes = var_3929_axes_0, keep_dims = var_3929_keep_dims_0, x = var_3927_cast_fp16)[name = string("op_3929_cast_fp16")];
+            fp16 var_3930_to_fp16 = const()[name = string("op_3930_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_171_cast_fp16 = add(x = var_3929_cast_fp16, y = var_3930_to_fp16)[name = string("mean_squared_171_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_3932_cast_fp16 = pow(x = mean_squared_171_cast_fp16, y = var_27_to_fp16)[name = string("op_3932_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_317_cast_fp16 = mul(x = hidden_states_707_cast_fp16, y = var_3932_cast_fp16)[name = string("normed_output_317_cast_fp16")];
+            tensor<fp16, [64]> const_297_to_fp16 = const()[name = string("const_297_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(263944704)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_319_cast_fp16 = mul(x = normed_output_317_cast_fp16, y = const_297_to_fp16)[name = string("normed_output_319_cast_fp16")];
+            tensor<int32, [2]> var_3952 = const()[name = string("op_3952"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3953_axis_0 = const()[name = string("op_3953_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3953_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_3953_cast_fp16_1 = split(axis = var_3953_axis_0, split_sizes = var_3952, x = normed_output_319_cast_fp16)[name = string("op_3953_cast_fp16")];
+            tensor<int32, [2]> var_3956 = const()[name = string("op_3956"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3957_axis_0 = const()[name = string("op_3957_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3957_0, tensor<fp16, [1, 2304, 32]> var_3957_1 = split(axis = var_3957_axis_0, split_sizes = var_3956, x = var_160_cast_fp16)[name = string("op_3957")];
+            tensor<int32, [2]> var_3960 = const()[name = string("op_3960"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_3961_axis_0 = const()[name = string("op_3961_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_3961_0, tensor<fp16, [1, 2304, 32]> var_3961_1 = split(axis = var_3961_axis_0, split_sizes = var_3960, x = var_163_cast_fp16)[name = string("op_3961")];
+            tensor<int32, [1]> cos_197_axes_0 = const()[name = string("cos_197_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_197 = expand_dims(axes = cos_197_axes_0, x = var_3957_0)[name = string("cos_197")];
+            tensor<int32, [1]> sin_197_axes_0 = const()[name = string("sin_197_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_197 = expand_dims(axes = sin_197_axes_0, x = var_3961_0)[name = string("sin_197")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3966_cast_fp16 = mul(x = var_3953_cast_fp16_0, y = cos_197)[name = string("op_3966_cast_fp16")];
+            tensor<int32, [4]> x1_97_begin_0 = const()[name = string("x1_97_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_97_end_0 = const()[name = string("x1_97_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_97_end_mask_0 = const()[name = string("x1_97_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_97_cast_fp16 = slice_by_index(begin = x1_97_begin_0, end = x1_97_end_0, end_mask = x1_97_end_mask_0, x = var_3953_cast_fp16_0)[name = string("x1_97_cast_fp16")];
+            tensor<int32, [4]> x2_97_begin_0 = const()[name = string("x2_97_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_97_end_0 = const()[name = string("x2_97_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_97_end_mask_0 = const()[name = string("x2_97_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_97_cast_fp16 = slice_by_index(begin = x2_97_begin_0, end = x2_97_end_0, end_mask = x2_97_end_mask_0, x = var_3953_cast_fp16_0)[name = string("x2_97_cast_fp16")];
+            fp16 const_302_promoted_to_fp16 = const()[name = string("const_302_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3977_cast_fp16 = mul(x = x2_97_cast_fp16, y = const_302_promoted_to_fp16)[name = string("op_3977_cast_fp16")];
+            bool var_3979_interleave_0 = const()[name = string("op_3979_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3979_cast_fp16 = concat(axis = var_38, interleave = var_3979_interleave_0, values = (var_3977_cast_fp16, x1_97_cast_fp16))[name = string("op_3979_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3980_cast_fp16 = mul(x = var_3979_cast_fp16, y = sin_197)[name = string("op_3980_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3981_cast_fp16 = add(x = var_3966_cast_fp16, y = var_3980_cast_fp16)[name = string("op_3981_cast_fp16")];
+            tensor<int32, [1]> cos_201_axes_0 = const()[name = string("cos_201_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_201 = expand_dims(axes = cos_201_axes_0, x = var_3957_1)[name = string("cos_201")];
+            tensor<int32, [1]> sin_201_axes_0 = const()[name = string("sin_201_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_201 = expand_dims(axes = sin_201_axes_0, x = var_3961_1)[name = string("sin_201")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3984_cast_fp16 = mul(x = var_3953_cast_fp16_1, y = cos_201)[name = string("op_3984_cast_fp16")];
+            tensor<int32, [4]> x1_99_begin_0 = const()[name = string("x1_99_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_99_end_0 = const()[name = string("x1_99_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_99_end_mask_0 = const()[name = string("x1_99_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_99_cast_fp16 = slice_by_index(begin = x1_99_begin_0, end = x1_99_end_0, end_mask = x1_99_end_mask_0, x = var_3953_cast_fp16_1)[name = string("x1_99_cast_fp16")];
+            tensor<int32, [4]> x2_99_begin_0 = const()[name = string("x2_99_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_99_end_0 = const()[name = string("x2_99_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_99_end_mask_0 = const()[name = string("x2_99_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_99_cast_fp16 = slice_by_index(begin = x2_99_begin_0, end = x2_99_end_0, end_mask = x2_99_end_mask_0, x = var_3953_cast_fp16_1)[name = string("x2_99_cast_fp16")];
+            fp16 const_305_promoted_to_fp16 = const()[name = string("const_305_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_3995_cast_fp16 = mul(x = x2_99_cast_fp16, y = const_305_promoted_to_fp16)[name = string("op_3995_cast_fp16")];
+            bool var_3997_interleave_0 = const()[name = string("op_3997_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_3997_cast_fp16 = concat(axis = var_38, interleave = var_3997_interleave_0, values = (var_3995_cast_fp16, x1_99_cast_fp16))[name = string("op_3997_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3998_cast_fp16 = mul(x = var_3997_cast_fp16, y = sin_201)[name = string("op_3998_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_3999_cast_fp16 = add(x = var_3984_cast_fp16, y = var_3998_cast_fp16)[name = string("op_3999_cast_fp16")];
+            bool query_states_25_interleave_0 = const()[name = string("query_states_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_25_cast_fp16 = concat(axis = var_38, interleave = query_states_25_interleave_0, values = (var_3981_cast_fp16, var_3999_cast_fp16))[name = string("query_states_25_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_12_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(263944896)))];
+            tensor<fp16, [1, 2304, 768]> linear_86_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_12_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_168_cast_fp16)[name = string("linear_86_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.2p+4)];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.1ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_171_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_86_cast_fp16)[name = string("clip_171_cast_fp16")];
+            tensor<int32, [4]> var_4012 = const()[name = string("op_4012"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_713_cast_fp16 = reshape(shape = var_4012, x = clip_171_cast_fp16)[name = string("hidden_states_713_cast_fp16")];
+            fp16 var_33_promoted_86_to_fp16 = const()[name = string("op_33_promoted_86_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4016_cast_fp16 = pow(x = hidden_states_713_cast_fp16, y = var_33_promoted_86_to_fp16)[name = string("op_4016_cast_fp16")];
+            tensor<int32, [1]> var_4018_axes_0 = const()[name = string("op_4018_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4018_keep_dims_0 = const()[name = string("op_4018_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4018_cast_fp16 = reduce_mean(axes = var_4018_axes_0, keep_dims = var_4018_keep_dims_0, x = var_4016_cast_fp16)[name = string("op_4018_cast_fp16")];
+            fp16 var_4019_to_fp16 = const()[name = string("op_4019_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_173_cast_fp16 = add(x = var_4018_cast_fp16, y = var_4019_to_fp16)[name = string("mean_squared_173_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4021_cast_fp16 = pow(x = mean_squared_173_cast_fp16, y = var_27_to_fp16)[name = string("op_4021_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_321_cast_fp16 = mul(x = hidden_states_713_cast_fp16, y = var_4021_cast_fp16)[name = string("normed_output_321_cast_fp16")];
+            tensor<fp16, [64]> const_306_to_fp16 = const()[name = string("const_306_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265124608)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_323_cast_fp16 = mul(x = normed_output_321_cast_fp16, y = const_306_to_fp16)[name = string("normed_output_323_cast_fp16")];
+            tensor<int32, [2]> var_4041 = const()[name = string("op_4041"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4042_axis_0 = const()[name = string("op_4042_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4042_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4042_cast_fp16_1 = split(axis = var_4042_axis_0, split_sizes = var_4041, x = normed_output_323_cast_fp16)[name = string("op_4042_cast_fp16")];
+            tensor<int32, [2]> var_4045 = const()[name = string("op_4045"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4046_axis_0 = const()[name = string("op_4046_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4046_0, tensor<fp16, [1, 2304, 32]> var_4046_1 = split(axis = var_4046_axis_0, split_sizes = var_4045, x = var_160_cast_fp16)[name = string("op_4046")];
+            tensor<int32, [2]> var_4049 = const()[name = string("op_4049"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4050_axis_0 = const()[name = string("op_4050_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4050_0, tensor<fp16, [1, 2304, 32]> var_4050_1 = split(axis = var_4050_axis_0, split_sizes = var_4049, x = var_163_cast_fp16)[name = string("op_4050")];
+            tensor<int32, [1]> cos_205_axes_0 = const()[name = string("cos_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_205 = expand_dims(axes = cos_205_axes_0, x = var_4046_0)[name = string("cos_205")];
+            tensor<int32, [1]> sin_205_axes_0 = const()[name = string("sin_205_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_205 = expand_dims(axes = sin_205_axes_0, x = var_4050_0)[name = string("sin_205")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4055_cast_fp16 = mul(x = var_4042_cast_fp16_0, y = cos_205)[name = string("op_4055_cast_fp16")];
+            tensor<int32, [4]> x1_101_begin_0 = const()[name = string("x1_101_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_101_end_0 = const()[name = string("x1_101_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_101_end_mask_0 = const()[name = string("x1_101_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_101_cast_fp16 = slice_by_index(begin = x1_101_begin_0, end = x1_101_end_0, end_mask = x1_101_end_mask_0, x = var_4042_cast_fp16_0)[name = string("x1_101_cast_fp16")];
+            tensor<int32, [4]> x2_101_begin_0 = const()[name = string("x2_101_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_101_end_0 = const()[name = string("x2_101_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_101_end_mask_0 = const()[name = string("x2_101_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_101_cast_fp16 = slice_by_index(begin = x2_101_begin_0, end = x2_101_end_0, end_mask = x2_101_end_mask_0, x = var_4042_cast_fp16_0)[name = string("x2_101_cast_fp16")];
+            fp16 const_311_promoted_to_fp16 = const()[name = string("const_311_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4066_cast_fp16 = mul(x = x2_101_cast_fp16, y = const_311_promoted_to_fp16)[name = string("op_4066_cast_fp16")];
+            bool var_4068_interleave_0 = const()[name = string("op_4068_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4068_cast_fp16 = concat(axis = var_38, interleave = var_4068_interleave_0, values = (var_4066_cast_fp16, x1_101_cast_fp16))[name = string("op_4068_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4069_cast_fp16 = mul(x = var_4068_cast_fp16, y = sin_205)[name = string("op_4069_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4070_cast_fp16 = add(x = var_4055_cast_fp16, y = var_4069_cast_fp16)[name = string("op_4070_cast_fp16")];
+            tensor<int32, [1]> cos_209_axes_0 = const()[name = string("cos_209_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_209 = expand_dims(axes = cos_209_axes_0, x = var_4046_1)[name = string("cos_209")];
+            tensor<int32, [1]> sin_209_axes_0 = const()[name = string("sin_209_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_209 = expand_dims(axes = sin_209_axes_0, x = var_4050_1)[name = string("sin_209")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4073_cast_fp16 = mul(x = var_4042_cast_fp16_1, y = cos_209)[name = string("op_4073_cast_fp16")];
+            tensor<int32, [4]> x1_103_begin_0 = const()[name = string("x1_103_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_103_end_0 = const()[name = string("x1_103_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_103_end_mask_0 = const()[name = string("x1_103_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_103_cast_fp16 = slice_by_index(begin = x1_103_begin_0, end = x1_103_end_0, end_mask = x1_103_end_mask_0, x = var_4042_cast_fp16_1)[name = string("x1_103_cast_fp16")];
+            tensor<int32, [4]> x2_103_begin_0 = const()[name = string("x2_103_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_103_end_0 = const()[name = string("x2_103_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_103_end_mask_0 = const()[name = string("x2_103_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_103_cast_fp16 = slice_by_index(begin = x2_103_begin_0, end = x2_103_end_0, end_mask = x2_103_end_mask_0, x = var_4042_cast_fp16_1)[name = string("x2_103_cast_fp16")];
+            fp16 const_314_promoted_to_fp16 = const()[name = string("const_314_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4084_cast_fp16 = mul(x = x2_103_cast_fp16, y = const_314_promoted_to_fp16)[name = string("op_4084_cast_fp16")];
+            bool var_4086_interleave_0 = const()[name = string("op_4086_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4086_cast_fp16 = concat(axis = var_38, interleave = var_4086_interleave_0, values = (var_4084_cast_fp16, x1_103_cast_fp16))[name = string("op_4086_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4087_cast_fp16 = mul(x = var_4086_cast_fp16, y = sin_209)[name = string("op_4087_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4088_cast_fp16 = add(x = var_4073_cast_fp16, y = var_4087_cast_fp16)[name = string("op_4088_cast_fp16")];
+            bool key_states_25_interleave_0 = const()[name = string("key_states_25_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_25_cast_fp16 = concat(axis = var_38, interleave = key_states_25_interleave_0, values = (var_4070_cast_fp16, var_4088_cast_fp16))[name = string("key_states_25_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_12_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(265124800)))];
+            tensor<fp16, [1, 2304, 768]> linear_87_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_12_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_168_cast_fp16)[name = string("linear_87_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.2p+4)];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.1ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_173_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_87_cast_fp16)[name = string("clip_173_cast_fp16")];
+            tensor<int32, [4]> var_4101 = const()[name = string("op_4101"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_719_cast_fp16 = reshape(shape = var_4101, x = clip_173_cast_fp16)[name = string("hidden_states_719_cast_fp16")];
+            fp16 var_33_promoted_87_to_fp16 = const()[name = string("op_33_promoted_87_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4104_cast_fp16 = pow(x = hidden_states_719_cast_fp16, y = var_33_promoted_87_to_fp16)[name = string("op_4104_cast_fp16")];
+            tensor<int32, [1]> var_4106_axes_0 = const()[name = string("op_4106_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4106_keep_dims_0 = const()[name = string("op_4106_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4106_cast_fp16 = reduce_mean(axes = var_4106_axes_0, keep_dims = var_4106_keep_dims_0, x = var_4104_cast_fp16)[name = string("op_4106_cast_fp16")];
+            fp16 var_4107_to_fp16 = const()[name = string("op_4107_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_175_cast_fp16 = add(x = var_4106_cast_fp16, y = var_4107_to_fp16)[name = string("mean_squared_175_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4109_cast_fp16 = pow(x = mean_squared_175_cast_fp16, y = var_27_to_fp16)[name = string("op_4109_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_325_cast_fp16 = mul(x = hidden_states_719_cast_fp16, y = var_4109_cast_fp16)[name = string("normed_output_325_cast_fp16")];
+            tensor<int32, [4]> hidden_states_725_perm_0 = const()[name = string("hidden_states_725_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_12_transpose_y_0 = const()[name = string("matmul_12_transpose_y_0"), val = bool(true)];
+            bool matmul_12_transpose_x_0 = const()[name = string("matmul_12_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_88_perm_0 = const()[name = string("transpose_88_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_89_perm_0 = const()[name = string("transpose_89_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_89 = transpose(perm = transpose_89_perm_0, x = key_states_25_cast_fp16)[name = string("transpose_109")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_88 = transpose(perm = transpose_88_perm_0, x = query_states_25_cast_fp16)[name = string("transpose_110")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_12_cast_fp16 = matmul(transpose_x = matmul_12_transpose_x_0, transpose_y = matmul_12_transpose_y_0, x = transpose_88, y = transpose_89)[name = string("matmul_12_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_12_cast_fp16 = add(x = matmul_12_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_12_cast_fp16")];
+            int32 softmax_12_axis_0 = const()[name = string("softmax_12_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_12_cast_fp16 = softmax(axis = softmax_12_axis_0, x = add_12_cast_fp16)[name = string("softmax_12_cast_fp16")];
+            bool attn_output_49_transpose_x_0 = const()[name = string("attn_output_49_transpose_x_0"), val = bool(false)];
+            bool attn_output_49_transpose_y_0 = const()[name = string("attn_output_49_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_725_cast_fp16 = transpose(perm = hidden_states_725_perm_0, x = normed_output_325_cast_fp16)[name = string("transpose_111")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_49_cast_fp16 = matmul(transpose_x = attn_output_49_transpose_x_0, transpose_y = attn_output_49_transpose_y_0, x = softmax_12_cast_fp16, y = hidden_states_725_cast_fp16)[name = string("attn_output_49_cast_fp16")];
+            tensor<int32, [4]> var_4114_perm_0 = const()[name = string("op_4114_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4116 = const()[name = string("op_4116"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_4114_cast_fp16 = transpose(perm = var_4114_perm_0, x = attn_output_49_cast_fp16)[name = string("transpose_108")];
+            tensor<fp16, [1, 2304, 768]> var_4117_cast_fp16 = reshape(shape = var_4116, x = var_4114_cast_fp16)[name = string("op_4117_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.eep+0)];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.ecp+0)];
+            tensor<fp16, [1, 2304, 768]> clip_174_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_self_attn_o_proj_input_max_promoted_to_fp16, x = var_4117_cast_fp16)[name = string("clip_174_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_12_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(266304512)))];
+            tensor<fp16, [1, 2304, 768]> linear_88_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_12_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_174_cast_fp16)[name = string("linear_88_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3cp+1)];
+            fp16 model_vision_tower_encoder_layers_12_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.3ap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_175_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_88_cast_fp16)[name = string("clip_175_cast_fp16")];
+            fp16 var_33_promoted_88_to_fp16 = const()[name = string("op_33_promoted_88_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4130_cast_fp16 = pow(x = clip_175_cast_fp16, y = var_33_promoted_88_to_fp16)[name = string("op_4130_cast_fp16")];
+            tensor<int32, [1]> var_4132_axes_0 = const()[name = string("op_4132_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4132_keep_dims_0 = const()[name = string("op_4132_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4132_cast_fp16 = reduce_mean(axes = var_4132_axes_0, keep_dims = var_4132_keep_dims_0, x = var_4130_cast_fp16)[name = string("op_4132_cast_fp16")];
+            fp16 var_4133_to_fp16 = const()[name = string("op_4133_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_177_cast_fp16 = add(x = var_4132_cast_fp16, y = var_4133_to_fp16)[name = string("mean_squared_177_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4135_cast_fp16 = pow(x = mean_squared_177_cast_fp16, y = var_27_to_fp16)[name = string("op_4135_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_327_cast_fp16 = mul(x = clip_175_cast_fp16, y = var_4135_cast_fp16)[name = string("normed_output_327_cast_fp16")];
+            tensor<fp16, [768]> const_315_to_fp16 = const()[name = string("const_315_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(267484224)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_329_cast_fp16 = mul(x = normed_output_327_cast_fp16, y = const_315_to_fp16)[name = string("normed_output_329_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_737_cast_fp16 = add(x = hidden_states_699_cast_fp16, y = normed_output_329_cast_fp16)[name = string("hidden_states_737_cast_fp16")];
+            fp16 var_33_promoted_89_to_fp16 = const()[name = string("op_33_promoted_89_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4143_cast_fp16 = pow(x = hidden_states_737_cast_fp16, y = var_33_promoted_89_to_fp16)[name = string("op_4143_cast_fp16")];
+            tensor<int32, [1]> var_4145_axes_0 = const()[name = string("op_4145_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4145_keep_dims_0 = const()[name = string("op_4145_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4145_cast_fp16 = reduce_mean(axes = var_4145_axes_0, keep_dims = var_4145_keep_dims_0, x = var_4143_cast_fp16)[name = string("op_4145_cast_fp16")];
+            fp16 var_4146_to_fp16 = const()[name = string("op_4146_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_179_cast_fp16 = add(x = var_4145_cast_fp16, y = var_4146_to_fp16)[name = string("mean_squared_179_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4148_cast_fp16 = pow(x = mean_squared_179_cast_fp16, y = var_27_to_fp16)[name = string("op_4148_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_331_cast_fp16 = mul(x = hidden_states_737_cast_fp16, y = var_4148_cast_fp16)[name = string("normed_output_331_cast_fp16")];
+            tensor<fp16, [768]> const_316_to_fp16 = const()[name = string("const_316_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(267485824)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_333_cast_fp16 = mul(x = normed_output_331_cast_fp16, y = const_316_to_fp16)[name = string("normed_output_333_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.cp+2)];
+            fp16 model_vision_tower_encoder_layers_12_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.bcp+2)];
+            tensor<fp16, [1, 2304, 768]> clip_176_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_333_cast_fp16)[name = string("clip_176_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_12_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(267487424)))];
+            tensor<fp16, [1, 2304, 3072]> linear_89_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_12_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_176_cast_fp16)[name = string("linear_89_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.cap+2)];
+            fp16 model_vision_tower_encoder_layers_12_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.c6p+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_177_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_89_cast_fp16)[name = string("clip_177_cast_fp16")];
+            string var_4165_mode_0 = const()[name = string("op_4165_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_4165_cast_fp16 = gelu(mode = var_4165_mode_0, x = clip_177_cast_fp16)[name = string("op_4165_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_12_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(272206080)))];
+            tensor<fp16, [1, 2304, 3072]> linear_90_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_12_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_176_cast_fp16)[name = string("linear_90_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.cap+2)];
+            fp16 model_vision_tower_encoder_layers_12_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.c6p+2)];
+            tensor<fp16, [1, 2304, 3072]> clip_179_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_mlp_up_proj_output_max_promoted_to_fp16, x = linear_90_cast_fp16)[name = string("clip_179_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_747_cast_fp16 = mul(x = var_4165_cast_fp16, y = clip_179_cast_fp16)[name = string("hidden_states_747_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.a4p+3)];
+            fp16 model_vision_tower_encoder_layers_12_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.ap+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_180_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_747_cast_fp16)[name = string("clip_180_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_12_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(276924736)))];
+            tensor<fp16, [1, 2304, 768]> linear_91_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_12_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_180_cast_fp16)[name = string("linear_91_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_12_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.1cp+2)];
+            fp16 model_vision_tower_encoder_layers_12_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_12_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.1ap+2)];
+            tensor<fp16, [1, 2304, 768]> clip_181_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_12_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_12_mlp_down_proj_output_max_promoted_to_fp16, x = linear_91_cast_fp16)[name = string("clip_181_cast_fp16")];
+            fp16 var_33_promoted_90_to_fp16 = const()[name = string("op_33_promoted_90_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4187_cast_fp16 = pow(x = clip_181_cast_fp16, y = var_33_promoted_90_to_fp16)[name = string("op_4187_cast_fp16")];
+            tensor<int32, [1]> var_4189_axes_0 = const()[name = string("op_4189_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4189_keep_dims_0 = const()[name = string("op_4189_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4189_cast_fp16 = reduce_mean(axes = var_4189_axes_0, keep_dims = var_4189_keep_dims_0, x = var_4187_cast_fp16)[name = string("op_4189_cast_fp16")];
+            fp16 var_4190_to_fp16 = const()[name = string("op_4190_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_181_cast_fp16 = add(x = var_4189_cast_fp16, y = var_4190_to_fp16)[name = string("mean_squared_181_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4192_cast_fp16 = pow(x = mean_squared_181_cast_fp16, y = var_27_to_fp16)[name = string("op_4192_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_335_cast_fp16 = mul(x = clip_181_cast_fp16, y = var_4192_cast_fp16)[name = string("normed_output_335_cast_fp16")];
+            tensor<fp16, [768]> const_317_to_fp16 = const()[name = string("const_317_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281643392)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_337_cast_fp16 = mul(x = normed_output_335_cast_fp16, y = const_317_to_fp16)[name = string("normed_output_337_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_757_cast_fp16 = add(x = hidden_states_737_cast_fp16, y = normed_output_337_cast_fp16)[name = string("hidden_states_757_cast_fp16")];
+            fp16 var_33_promoted_91_to_fp16 = const()[name = string("op_33_promoted_91_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4206_cast_fp16 = pow(x = hidden_states_757_cast_fp16, y = var_33_promoted_91_to_fp16)[name = string("op_4206_cast_fp16")];
+            tensor<int32, [1]> var_4208_axes_0 = const()[name = string("op_4208_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4208_keep_dims_0 = const()[name = string("op_4208_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4208_cast_fp16 = reduce_mean(axes = var_4208_axes_0, keep_dims = var_4208_keep_dims_0, x = var_4206_cast_fp16)[name = string("op_4208_cast_fp16")];
+            fp16 var_4209_to_fp16 = const()[name = string("op_4209_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_183_cast_fp16 = add(x = var_4208_cast_fp16, y = var_4209_to_fp16)[name = string("mean_squared_183_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4211_cast_fp16 = pow(x = mean_squared_183_cast_fp16, y = var_27_to_fp16)[name = string("op_4211_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_339_cast_fp16 = mul(x = hidden_states_757_cast_fp16, y = var_4211_cast_fp16)[name = string("normed_output_339_cast_fp16")];
+            tensor<fp16, [768]> const_318_to_fp16 = const()[name = string("const_318_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281644992)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_341_cast_fp16 = mul(x = normed_output_339_cast_fp16, y = const_318_to_fp16)[name = string("normed_output_341_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.12p+4)];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.0ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_182_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_341_cast_fp16)[name = string("clip_182_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_13_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(281646592)))];
+            tensor<fp16, [1, 2304, 768]> linear_92_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_13_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_182_cast_fp16)[name = string("linear_92_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.18p+4)];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.16p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_183_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_92_cast_fp16)[name = string("clip_183_cast_fp16")];
+            tensor<int32, [4]> var_4233 = const()[name = string("op_4233"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_765_cast_fp16 = reshape(shape = var_4233, x = clip_183_cast_fp16)[name = string("hidden_states_765_cast_fp16")];
+            fp16 var_33_promoted_92_to_fp16 = const()[name = string("op_33_promoted_92_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4237_cast_fp16 = pow(x = hidden_states_765_cast_fp16, y = var_33_promoted_92_to_fp16)[name = string("op_4237_cast_fp16")];
+            tensor<int32, [1]> var_4239_axes_0 = const()[name = string("op_4239_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4239_keep_dims_0 = const()[name = string("op_4239_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4239_cast_fp16 = reduce_mean(axes = var_4239_axes_0, keep_dims = var_4239_keep_dims_0, x = var_4237_cast_fp16)[name = string("op_4239_cast_fp16")];
+            fp16 var_4240_to_fp16 = const()[name = string("op_4240_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_185_cast_fp16 = add(x = var_4239_cast_fp16, y = var_4240_to_fp16)[name = string("mean_squared_185_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4242_cast_fp16 = pow(x = mean_squared_185_cast_fp16, y = var_27_to_fp16)[name = string("op_4242_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_343_cast_fp16 = mul(x = hidden_states_765_cast_fp16, y = var_4242_cast_fp16)[name = string("normed_output_343_cast_fp16")];
+            tensor<fp16, [64]> const_321_to_fp16 = const()[name = string("const_321_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282826304)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_345_cast_fp16 = mul(x = normed_output_343_cast_fp16, y = const_321_to_fp16)[name = string("normed_output_345_cast_fp16")];
+            tensor<int32, [2]> var_4262 = const()[name = string("op_4262"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4263_axis_0 = const()[name = string("op_4263_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4263_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4263_cast_fp16_1 = split(axis = var_4263_axis_0, split_sizes = var_4262, x = normed_output_345_cast_fp16)[name = string("op_4263_cast_fp16")];
+            tensor<int32, [2]> var_4266 = const()[name = string("op_4266"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4267_axis_0 = const()[name = string("op_4267_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4267_0, tensor<fp16, [1, 2304, 32]> var_4267_1 = split(axis = var_4267_axis_0, split_sizes = var_4266, x = var_160_cast_fp16)[name = string("op_4267")];
+            tensor<int32, [2]> var_4270 = const()[name = string("op_4270"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4271_axis_0 = const()[name = string("op_4271_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4271_0, tensor<fp16, [1, 2304, 32]> var_4271_1 = split(axis = var_4271_axis_0, split_sizes = var_4270, x = var_163_cast_fp16)[name = string("op_4271")];
+            tensor<int32, [1]> cos_213_axes_0 = const()[name = string("cos_213_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_213 = expand_dims(axes = cos_213_axes_0, x = var_4267_0)[name = string("cos_213")];
+            tensor<int32, [1]> sin_213_axes_0 = const()[name = string("sin_213_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_213 = expand_dims(axes = sin_213_axes_0, x = var_4271_0)[name = string("sin_213")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4276_cast_fp16 = mul(x = var_4263_cast_fp16_0, y = cos_213)[name = string("op_4276_cast_fp16")];
+            tensor<int32, [4]> x1_105_begin_0 = const()[name = string("x1_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_105_end_0 = const()[name = string("x1_105_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_105_end_mask_0 = const()[name = string("x1_105_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_105_cast_fp16 = slice_by_index(begin = x1_105_begin_0, end = x1_105_end_0, end_mask = x1_105_end_mask_0, x = var_4263_cast_fp16_0)[name = string("x1_105_cast_fp16")];
+            tensor<int32, [4]> x2_105_begin_0 = const()[name = string("x2_105_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_105_end_0 = const()[name = string("x2_105_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_105_end_mask_0 = const()[name = string("x2_105_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_105_cast_fp16 = slice_by_index(begin = x2_105_begin_0, end = x2_105_end_0, end_mask = x2_105_end_mask_0, x = var_4263_cast_fp16_0)[name = string("x2_105_cast_fp16")];
+            fp16 const_326_promoted_to_fp16 = const()[name = string("const_326_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4287_cast_fp16 = mul(x = x2_105_cast_fp16, y = const_326_promoted_to_fp16)[name = string("op_4287_cast_fp16")];
+            bool var_4289_interleave_0 = const()[name = string("op_4289_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4289_cast_fp16 = concat(axis = var_38, interleave = var_4289_interleave_0, values = (var_4287_cast_fp16, x1_105_cast_fp16))[name = string("op_4289_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4290_cast_fp16 = mul(x = var_4289_cast_fp16, y = sin_213)[name = string("op_4290_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4291_cast_fp16 = add(x = var_4276_cast_fp16, y = var_4290_cast_fp16)[name = string("op_4291_cast_fp16")];
+            tensor<int32, [1]> cos_217_axes_0 = const()[name = string("cos_217_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_217 = expand_dims(axes = cos_217_axes_0, x = var_4267_1)[name = string("cos_217")];
+            tensor<int32, [1]> sin_217_axes_0 = const()[name = string("sin_217_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_217 = expand_dims(axes = sin_217_axes_0, x = var_4271_1)[name = string("sin_217")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4294_cast_fp16 = mul(x = var_4263_cast_fp16_1, y = cos_217)[name = string("op_4294_cast_fp16")];
+            tensor<int32, [4]> x1_107_begin_0 = const()[name = string("x1_107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_107_end_0 = const()[name = string("x1_107_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_107_end_mask_0 = const()[name = string("x1_107_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_107_cast_fp16 = slice_by_index(begin = x1_107_begin_0, end = x1_107_end_0, end_mask = x1_107_end_mask_0, x = var_4263_cast_fp16_1)[name = string("x1_107_cast_fp16")];
+            tensor<int32, [4]> x2_107_begin_0 = const()[name = string("x2_107_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_107_end_0 = const()[name = string("x2_107_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_107_end_mask_0 = const()[name = string("x2_107_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_107_cast_fp16 = slice_by_index(begin = x2_107_begin_0, end = x2_107_end_0, end_mask = x2_107_end_mask_0, x = var_4263_cast_fp16_1)[name = string("x2_107_cast_fp16")];
+            fp16 const_329_promoted_to_fp16 = const()[name = string("const_329_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4305_cast_fp16 = mul(x = x2_107_cast_fp16, y = const_329_promoted_to_fp16)[name = string("op_4305_cast_fp16")];
+            bool var_4307_interleave_0 = const()[name = string("op_4307_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4307_cast_fp16 = concat(axis = var_38, interleave = var_4307_interleave_0, values = (var_4305_cast_fp16, x1_107_cast_fp16))[name = string("op_4307_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4308_cast_fp16 = mul(x = var_4307_cast_fp16, y = sin_217)[name = string("op_4308_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4309_cast_fp16 = add(x = var_4294_cast_fp16, y = var_4308_cast_fp16)[name = string("op_4309_cast_fp16")];
+            bool query_states_27_interleave_0 = const()[name = string("query_states_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_27_cast_fp16 = concat(axis = var_38, interleave = query_states_27_interleave_0, values = (var_4291_cast_fp16, var_4309_cast_fp16))[name = string("query_states_27_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_13_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(282826496)))];
+            tensor<fp16, [1, 2304, 768]> linear_93_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_13_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_182_cast_fp16)[name = string("linear_93_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.52p+4)];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_185_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_93_cast_fp16)[name = string("clip_185_cast_fp16")];
+            tensor<int32, [4]> var_4322 = const()[name = string("op_4322"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_771_cast_fp16 = reshape(shape = var_4322, x = clip_185_cast_fp16)[name = string("hidden_states_771_cast_fp16")];
+            fp16 var_33_promoted_93_to_fp16 = const()[name = string("op_33_promoted_93_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4326_cast_fp16 = pow(x = hidden_states_771_cast_fp16, y = var_33_promoted_93_to_fp16)[name = string("op_4326_cast_fp16")];
+            tensor<int32, [1]> var_4328_axes_0 = const()[name = string("op_4328_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4328_keep_dims_0 = const()[name = string("op_4328_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4328_cast_fp16 = reduce_mean(axes = var_4328_axes_0, keep_dims = var_4328_keep_dims_0, x = var_4326_cast_fp16)[name = string("op_4328_cast_fp16")];
+            fp16 var_4329_to_fp16 = const()[name = string("op_4329_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_187_cast_fp16 = add(x = var_4328_cast_fp16, y = var_4329_to_fp16)[name = string("mean_squared_187_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4331_cast_fp16 = pow(x = mean_squared_187_cast_fp16, y = var_27_to_fp16)[name = string("op_4331_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_347_cast_fp16 = mul(x = hidden_states_771_cast_fp16, y = var_4331_cast_fp16)[name = string("normed_output_347_cast_fp16")];
+            tensor<fp16, [64]> const_330_to_fp16 = const()[name = string("const_330_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(284006208)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_349_cast_fp16 = mul(x = normed_output_347_cast_fp16, y = const_330_to_fp16)[name = string("normed_output_349_cast_fp16")];
+            tensor<int32, [2]> var_4351 = const()[name = string("op_4351"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4352_axis_0 = const()[name = string("op_4352_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4352_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4352_cast_fp16_1 = split(axis = var_4352_axis_0, split_sizes = var_4351, x = normed_output_349_cast_fp16)[name = string("op_4352_cast_fp16")];
+            tensor<int32, [2]> var_4355 = const()[name = string("op_4355"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4356_axis_0 = const()[name = string("op_4356_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4356_0, tensor<fp16, [1, 2304, 32]> var_4356_1 = split(axis = var_4356_axis_0, split_sizes = var_4355, x = var_160_cast_fp16)[name = string("op_4356")];
+            tensor<int32, [2]> var_4359 = const()[name = string("op_4359"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4360_axis_0 = const()[name = string("op_4360_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4360_0, tensor<fp16, [1, 2304, 32]> var_4360_1 = split(axis = var_4360_axis_0, split_sizes = var_4359, x = var_163_cast_fp16)[name = string("op_4360")];
+            tensor<int32, [1]> cos_221_axes_0 = const()[name = string("cos_221_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_221 = expand_dims(axes = cos_221_axes_0, x = var_4356_0)[name = string("cos_221")];
+            tensor<int32, [1]> sin_221_axes_0 = const()[name = string("sin_221_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_221 = expand_dims(axes = sin_221_axes_0, x = var_4360_0)[name = string("sin_221")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4365_cast_fp16 = mul(x = var_4352_cast_fp16_0, y = cos_221)[name = string("op_4365_cast_fp16")];
+            tensor<int32, [4]> x1_109_begin_0 = const()[name = string("x1_109_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_109_end_0 = const()[name = string("x1_109_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_109_end_mask_0 = const()[name = string("x1_109_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_109_cast_fp16 = slice_by_index(begin = x1_109_begin_0, end = x1_109_end_0, end_mask = x1_109_end_mask_0, x = var_4352_cast_fp16_0)[name = string("x1_109_cast_fp16")];
+            tensor<int32, [4]> x2_109_begin_0 = const()[name = string("x2_109_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_109_end_0 = const()[name = string("x2_109_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_109_end_mask_0 = const()[name = string("x2_109_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_109_cast_fp16 = slice_by_index(begin = x2_109_begin_0, end = x2_109_end_0, end_mask = x2_109_end_mask_0, x = var_4352_cast_fp16_0)[name = string("x2_109_cast_fp16")];
+            fp16 const_335_promoted_to_fp16 = const()[name = string("const_335_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4376_cast_fp16 = mul(x = x2_109_cast_fp16, y = const_335_promoted_to_fp16)[name = string("op_4376_cast_fp16")];
+            bool var_4378_interleave_0 = const()[name = string("op_4378_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4378_cast_fp16 = concat(axis = var_38, interleave = var_4378_interleave_0, values = (var_4376_cast_fp16, x1_109_cast_fp16))[name = string("op_4378_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4379_cast_fp16 = mul(x = var_4378_cast_fp16, y = sin_221)[name = string("op_4379_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4380_cast_fp16 = add(x = var_4365_cast_fp16, y = var_4379_cast_fp16)[name = string("op_4380_cast_fp16")];
+            tensor<int32, [1]> cos_225_axes_0 = const()[name = string("cos_225_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_225 = expand_dims(axes = cos_225_axes_0, x = var_4356_1)[name = string("cos_225")];
+            tensor<int32, [1]> sin_225_axes_0 = const()[name = string("sin_225_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_225 = expand_dims(axes = sin_225_axes_0, x = var_4360_1)[name = string("sin_225")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4383_cast_fp16 = mul(x = var_4352_cast_fp16_1, y = cos_225)[name = string("op_4383_cast_fp16")];
+            tensor<int32, [4]> x1_111_begin_0 = const()[name = string("x1_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_111_end_0 = const()[name = string("x1_111_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_111_end_mask_0 = const()[name = string("x1_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_111_cast_fp16 = slice_by_index(begin = x1_111_begin_0, end = x1_111_end_0, end_mask = x1_111_end_mask_0, x = var_4352_cast_fp16_1)[name = string("x1_111_cast_fp16")];
+            tensor<int32, [4]> x2_111_begin_0 = const()[name = string("x2_111_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_111_end_0 = const()[name = string("x2_111_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_111_end_mask_0 = const()[name = string("x2_111_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_111_cast_fp16 = slice_by_index(begin = x2_111_begin_0, end = x2_111_end_0, end_mask = x2_111_end_mask_0, x = var_4352_cast_fp16_1)[name = string("x2_111_cast_fp16")];
+            fp16 const_338_promoted_to_fp16 = const()[name = string("const_338_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4394_cast_fp16 = mul(x = x2_111_cast_fp16, y = const_338_promoted_to_fp16)[name = string("op_4394_cast_fp16")];
+            bool var_4396_interleave_0 = const()[name = string("op_4396_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4396_cast_fp16 = concat(axis = var_38, interleave = var_4396_interleave_0, values = (var_4394_cast_fp16, x1_111_cast_fp16))[name = string("op_4396_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4397_cast_fp16 = mul(x = var_4396_cast_fp16, y = sin_225)[name = string("op_4397_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4398_cast_fp16 = add(x = var_4383_cast_fp16, y = var_4397_cast_fp16)[name = string("op_4398_cast_fp16")];
+            bool key_states_27_interleave_0 = const()[name = string("key_states_27_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_27_cast_fp16 = concat(axis = var_38, interleave = key_states_27_interleave_0, values = (var_4380_cast_fp16, var_4398_cast_fp16))[name = string("key_states_27_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_13_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(284006400)))];
+            tensor<fp16, [1, 2304, 768]> linear_94_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_13_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_182_cast_fp16)[name = string("linear_94_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.52p+4)];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_187_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_94_cast_fp16)[name = string("clip_187_cast_fp16")];
+            tensor<int32, [4]> var_4411 = const()[name = string("op_4411"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_777_cast_fp16 = reshape(shape = var_4411, x = clip_187_cast_fp16)[name = string("hidden_states_777_cast_fp16")];
+            fp16 var_33_promoted_94_to_fp16 = const()[name = string("op_33_promoted_94_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4414_cast_fp16 = pow(x = hidden_states_777_cast_fp16, y = var_33_promoted_94_to_fp16)[name = string("op_4414_cast_fp16")];
+            tensor<int32, [1]> var_4416_axes_0 = const()[name = string("op_4416_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4416_keep_dims_0 = const()[name = string("op_4416_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4416_cast_fp16 = reduce_mean(axes = var_4416_axes_0, keep_dims = var_4416_keep_dims_0, x = var_4414_cast_fp16)[name = string("op_4416_cast_fp16")];
+            fp16 var_4417_to_fp16 = const()[name = string("op_4417_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_189_cast_fp16 = add(x = var_4416_cast_fp16, y = var_4417_to_fp16)[name = string("mean_squared_189_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4419_cast_fp16 = pow(x = mean_squared_189_cast_fp16, y = var_27_to_fp16)[name = string("op_4419_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_351_cast_fp16 = mul(x = hidden_states_777_cast_fp16, y = var_4419_cast_fp16)[name = string("normed_output_351_cast_fp16")];
+            tensor<int32, [4]> hidden_states_783_perm_0 = const()[name = string("hidden_states_783_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_13_transpose_y_0 = const()[name = string("matmul_13_transpose_y_0"), val = bool(true)];
+            bool matmul_13_transpose_x_0 = const()[name = string("matmul_13_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_90_perm_0 = const()[name = string("transpose_90_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_91_perm_0 = const()[name = string("transpose_91_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_91 = transpose(perm = transpose_91_perm_0, x = key_states_27_cast_fp16)[name = string("transpose_105")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_90 = transpose(perm = transpose_90_perm_0, x = query_states_27_cast_fp16)[name = string("transpose_106")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_13_cast_fp16 = matmul(transpose_x = matmul_13_transpose_x_0, transpose_y = matmul_13_transpose_y_0, x = transpose_90, y = transpose_91)[name = string("matmul_13_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_13_cast_fp16 = add(x = matmul_13_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_13_cast_fp16")];
+            int32 softmax_13_axis_0 = const()[name = string("softmax_13_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_13_cast_fp16 = softmax(axis = softmax_13_axis_0, x = add_13_cast_fp16)[name = string("softmax_13_cast_fp16")];
+            bool attn_output_53_transpose_x_0 = const()[name = string("attn_output_53_transpose_x_0"), val = bool(false)];
+            bool attn_output_53_transpose_y_0 = const()[name = string("attn_output_53_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_783_cast_fp16 = transpose(perm = hidden_states_783_perm_0, x = normed_output_351_cast_fp16)[name = string("transpose_107")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_53_cast_fp16 = matmul(transpose_x = attn_output_53_transpose_x_0, transpose_y = attn_output_53_transpose_y_0, x = softmax_13_cast_fp16, y = hidden_states_783_cast_fp16)[name = string("attn_output_53_cast_fp16")];
+            tensor<int32, [4]> var_4424_perm_0 = const()[name = string("op_4424_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4426 = const()[name = string("op_4426"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_4424_cast_fp16 = transpose(perm = var_4424_perm_0, x = attn_output_53_cast_fp16)[name = string("transpose_104")];
+            tensor<fp16, [1, 2304, 768]> var_4427_cast_fp16 = reshape(shape = var_4426, x = var_4424_cast_fp16)[name = string("op_4427_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.f8p+0)];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.f4p+0)];
+            tensor<fp16, [1, 2304, 768]> clip_188_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_self_attn_o_proj_input_max_promoted_to_fp16, x = var_4427_cast_fp16)[name = string("clip_188_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_13_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(285186112)))];
+            tensor<fp16, [1, 2304, 768]> linear_95_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_13_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_188_cast_fp16)[name = string("linear_95_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6ep+1)];
+            fp16 model_vision_tower_encoder_layers_13_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.6ap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_189_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_95_cast_fp16)[name = string("clip_189_cast_fp16")];
+            fp16 var_33_promoted_95_to_fp16 = const()[name = string("op_33_promoted_95_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4440_cast_fp16 = pow(x = clip_189_cast_fp16, y = var_33_promoted_95_to_fp16)[name = string("op_4440_cast_fp16")];
+            tensor<int32, [1]> var_4442_axes_0 = const()[name = string("op_4442_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4442_keep_dims_0 = const()[name = string("op_4442_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4442_cast_fp16 = reduce_mean(axes = var_4442_axes_0, keep_dims = var_4442_keep_dims_0, x = var_4440_cast_fp16)[name = string("op_4442_cast_fp16")];
+            fp16 var_4443_to_fp16 = const()[name = string("op_4443_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_191_cast_fp16 = add(x = var_4442_cast_fp16, y = var_4443_to_fp16)[name = string("mean_squared_191_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4445_cast_fp16 = pow(x = mean_squared_191_cast_fp16, y = var_27_to_fp16)[name = string("op_4445_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_353_cast_fp16 = mul(x = clip_189_cast_fp16, y = var_4445_cast_fp16)[name = string("normed_output_353_cast_fp16")];
+            tensor<fp16, [768]> const_339_to_fp16 = const()[name = string("const_339_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(286365824)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_355_cast_fp16 = mul(x = normed_output_353_cast_fp16, y = const_339_to_fp16)[name = string("normed_output_355_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_795_cast_fp16 = add(x = hidden_states_757_cast_fp16, y = normed_output_355_cast_fp16)[name = string("hidden_states_795_cast_fp16")];
+            fp16 var_33_promoted_96_to_fp16 = const()[name = string("op_33_promoted_96_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4453_cast_fp16 = pow(x = hidden_states_795_cast_fp16, y = var_33_promoted_96_to_fp16)[name = string("op_4453_cast_fp16")];
+            tensor<int32, [1]> var_4455_axes_0 = const()[name = string("op_4455_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4455_keep_dims_0 = const()[name = string("op_4455_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4455_cast_fp16 = reduce_mean(axes = var_4455_axes_0, keep_dims = var_4455_keep_dims_0, x = var_4453_cast_fp16)[name = string("op_4455_cast_fp16")];
+            fp16 var_4456_to_fp16 = const()[name = string("op_4456_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_193_cast_fp16 = add(x = var_4455_cast_fp16, y = var_4456_to_fp16)[name = string("mean_squared_193_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4458_cast_fp16 = pow(x = mean_squared_193_cast_fp16, y = var_27_to_fp16)[name = string("op_4458_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_357_cast_fp16 = mul(x = hidden_states_795_cast_fp16, y = var_4458_cast_fp16)[name = string("normed_output_357_cast_fp16")];
+            tensor<fp16, [768]> const_340_to_fp16 = const()[name = string("const_340_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(286367424)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_359_cast_fp16 = mul(x = normed_output_357_cast_fp16, y = const_340_to_fp16)[name = string("normed_output_359_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1p+3)];
+            fp16 model_vision_tower_encoder_layers_13_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.fcp+2)];
+            tensor<fp16, [1, 2304, 768]> clip_190_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_359_cast_fp16)[name = string("clip_190_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_13_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(286369024)))];
+            tensor<fp16, [1, 2304, 3072]> linear_96_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_13_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_190_cast_fp16)[name = string("linear_96_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.08p+3)];
+            fp16 model_vision_tower_encoder_layers_13_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.06p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_191_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_96_cast_fp16)[name = string("clip_191_cast_fp16")];
+            string var_4475_mode_0 = const()[name = string("op_4475_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_4475_cast_fp16 = gelu(mode = var_4475_mode_0, x = clip_191_cast_fp16)[name = string("op_4475_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_13_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(291087680)))];
+            tensor<fp16, [1, 2304, 3072]> linear_97_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_13_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_190_cast_fp16)[name = string("linear_97_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.08p+3)];
+            fp16 model_vision_tower_encoder_layers_13_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.06p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_193_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_mlp_up_proj_output_max_promoted_to_fp16, x = linear_97_cast_fp16)[name = string("clip_193_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_805_cast_fp16 = mul(x = var_4475_cast_fp16, y = clip_193_cast_fp16)[name = string("hidden_states_805_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.8ap+3)];
+            fp16 model_vision_tower_encoder_layers_13_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.86p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_194_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_805_cast_fp16)[name = string("clip_194_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_13_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(295806336)))];
+            tensor<fp16, [1, 2304, 768]> linear_98_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_13_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_194_cast_fp16)[name = string("linear_98_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_13_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.f4p+1)];
+            fp16 model_vision_tower_encoder_layers_13_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_13_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.fp+1)];
+            tensor<fp16, [1, 2304, 768]> clip_195_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_13_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_13_mlp_down_proj_output_max_promoted_to_fp16, x = linear_98_cast_fp16)[name = string("clip_195_cast_fp16")];
+            fp16 var_33_promoted_97_to_fp16 = const()[name = string("op_33_promoted_97_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4497_cast_fp16 = pow(x = clip_195_cast_fp16, y = var_33_promoted_97_to_fp16)[name = string("op_4497_cast_fp16")];
+            tensor<int32, [1]> var_4499_axes_0 = const()[name = string("op_4499_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4499_keep_dims_0 = const()[name = string("op_4499_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4499_cast_fp16 = reduce_mean(axes = var_4499_axes_0, keep_dims = var_4499_keep_dims_0, x = var_4497_cast_fp16)[name = string("op_4499_cast_fp16")];
+            fp16 var_4500_to_fp16 = const()[name = string("op_4500_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_195_cast_fp16 = add(x = var_4499_cast_fp16, y = var_4500_to_fp16)[name = string("mean_squared_195_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4502_cast_fp16 = pow(x = mean_squared_195_cast_fp16, y = var_27_to_fp16)[name = string("op_4502_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_361_cast_fp16 = mul(x = clip_195_cast_fp16, y = var_4502_cast_fp16)[name = string("normed_output_361_cast_fp16")];
+            tensor<fp16, [768]> const_341_to_fp16 = const()[name = string("const_341_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(300524992)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_363_cast_fp16 = mul(x = normed_output_361_cast_fp16, y = const_341_to_fp16)[name = string("normed_output_363_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_815_cast_fp16 = add(x = hidden_states_795_cast_fp16, y = normed_output_363_cast_fp16)[name = string("hidden_states_815_cast_fp16")];
+            fp16 var_33_promoted_98_to_fp16 = const()[name = string("op_33_promoted_98_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4516_cast_fp16 = pow(x = hidden_states_815_cast_fp16, y = var_33_promoted_98_to_fp16)[name = string("op_4516_cast_fp16")];
+            tensor<int32, [1]> var_4518_axes_0 = const()[name = string("op_4518_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4518_keep_dims_0 = const()[name = string("op_4518_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4518_cast_fp16 = reduce_mean(axes = var_4518_axes_0, keep_dims = var_4518_keep_dims_0, x = var_4516_cast_fp16)[name = string("op_4518_cast_fp16")];
+            fp16 var_4519_to_fp16 = const()[name = string("op_4519_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_197_cast_fp16 = add(x = var_4518_cast_fp16, y = var_4519_to_fp16)[name = string("mean_squared_197_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4521_cast_fp16 = pow(x = mean_squared_197_cast_fp16, y = var_27_to_fp16)[name = string("op_4521_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_365_cast_fp16 = mul(x = hidden_states_815_cast_fp16, y = var_4521_cast_fp16)[name = string("normed_output_365_cast_fp16")];
+            tensor<fp16, [768]> const_342_to_fp16 = const()[name = string("const_342_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(300526592)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_367_cast_fp16 = mul(x = normed_output_365_cast_fp16, y = const_342_to_fp16)[name = string("normed_output_367_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.3ap+4)];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.38p+4)];
+            tensor<fp16, [1, 2304, 768]> clip_196_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_367_cast_fp16)[name = string("clip_196_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_14_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(300528192)))];
+            tensor<fp16, [1, 2304, 768]> linear_99_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_14_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_196_cast_fp16)[name = string("linear_99_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.62p+4)];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_197_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_99_cast_fp16)[name = string("clip_197_cast_fp16")];
+            tensor<int32, [4]> var_4543 = const()[name = string("op_4543"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_823_cast_fp16 = reshape(shape = var_4543, x = clip_197_cast_fp16)[name = string("hidden_states_823_cast_fp16")];
+            fp16 var_33_promoted_99_to_fp16 = const()[name = string("op_33_promoted_99_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4547_cast_fp16 = pow(x = hidden_states_823_cast_fp16, y = var_33_promoted_99_to_fp16)[name = string("op_4547_cast_fp16")];
+            tensor<int32, [1]> var_4549_axes_0 = const()[name = string("op_4549_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4549_keep_dims_0 = const()[name = string("op_4549_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4549_cast_fp16 = reduce_mean(axes = var_4549_axes_0, keep_dims = var_4549_keep_dims_0, x = var_4547_cast_fp16)[name = string("op_4549_cast_fp16")];
+            fp16 var_4550_to_fp16 = const()[name = string("op_4550_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_199_cast_fp16 = add(x = var_4549_cast_fp16, y = var_4550_to_fp16)[name = string("mean_squared_199_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4552_cast_fp16 = pow(x = mean_squared_199_cast_fp16, y = var_27_to_fp16)[name = string("op_4552_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_369_cast_fp16 = mul(x = hidden_states_823_cast_fp16, y = var_4552_cast_fp16)[name = string("normed_output_369_cast_fp16")];
+            tensor<fp16, [64]> const_345_to_fp16 = const()[name = string("const_345_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301707904)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_371_cast_fp16 = mul(x = normed_output_369_cast_fp16, y = const_345_to_fp16)[name = string("normed_output_371_cast_fp16")];
+            tensor<int32, [2]> var_4572 = const()[name = string("op_4572"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4573_axis_0 = const()[name = string("op_4573_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4573_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4573_cast_fp16_1 = split(axis = var_4573_axis_0, split_sizes = var_4572, x = normed_output_371_cast_fp16)[name = string("op_4573_cast_fp16")];
+            tensor<int32, [2]> var_4576 = const()[name = string("op_4576"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4577_axis_0 = const()[name = string("op_4577_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4577_0, tensor<fp16, [1, 2304, 32]> var_4577_1 = split(axis = var_4577_axis_0, split_sizes = var_4576, x = var_160_cast_fp16)[name = string("op_4577")];
+            tensor<int32, [2]> var_4580 = const()[name = string("op_4580"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4581_axis_0 = const()[name = string("op_4581_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4581_0, tensor<fp16, [1, 2304, 32]> var_4581_1 = split(axis = var_4581_axis_0, split_sizes = var_4580, x = var_163_cast_fp16)[name = string("op_4581")];
+            tensor<int32, [1]> cos_229_axes_0 = const()[name = string("cos_229_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_229 = expand_dims(axes = cos_229_axes_0, x = var_4577_0)[name = string("cos_229")];
+            tensor<int32, [1]> sin_229_axes_0 = const()[name = string("sin_229_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_229 = expand_dims(axes = sin_229_axes_0, x = var_4581_0)[name = string("sin_229")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4586_cast_fp16 = mul(x = var_4573_cast_fp16_0, y = cos_229)[name = string("op_4586_cast_fp16")];
+            tensor<int32, [4]> x1_113_begin_0 = const()[name = string("x1_113_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_113_end_0 = const()[name = string("x1_113_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_113_end_mask_0 = const()[name = string("x1_113_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_113_cast_fp16 = slice_by_index(begin = x1_113_begin_0, end = x1_113_end_0, end_mask = x1_113_end_mask_0, x = var_4573_cast_fp16_0)[name = string("x1_113_cast_fp16")];
+            tensor<int32, [4]> x2_113_begin_0 = const()[name = string("x2_113_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_113_end_0 = const()[name = string("x2_113_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_113_end_mask_0 = const()[name = string("x2_113_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_113_cast_fp16 = slice_by_index(begin = x2_113_begin_0, end = x2_113_end_0, end_mask = x2_113_end_mask_0, x = var_4573_cast_fp16_0)[name = string("x2_113_cast_fp16")];
+            fp16 const_350_promoted_to_fp16 = const()[name = string("const_350_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4597_cast_fp16 = mul(x = x2_113_cast_fp16, y = const_350_promoted_to_fp16)[name = string("op_4597_cast_fp16")];
+            bool var_4599_interleave_0 = const()[name = string("op_4599_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4599_cast_fp16 = concat(axis = var_38, interleave = var_4599_interleave_0, values = (var_4597_cast_fp16, x1_113_cast_fp16))[name = string("op_4599_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4600_cast_fp16 = mul(x = var_4599_cast_fp16, y = sin_229)[name = string("op_4600_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4601_cast_fp16 = add(x = var_4586_cast_fp16, y = var_4600_cast_fp16)[name = string("op_4601_cast_fp16")];
+            tensor<int32, [1]> cos_233_axes_0 = const()[name = string("cos_233_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_233 = expand_dims(axes = cos_233_axes_0, x = var_4577_1)[name = string("cos_233")];
+            tensor<int32, [1]> sin_233_axes_0 = const()[name = string("sin_233_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_233 = expand_dims(axes = sin_233_axes_0, x = var_4581_1)[name = string("sin_233")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4604_cast_fp16 = mul(x = var_4573_cast_fp16_1, y = cos_233)[name = string("op_4604_cast_fp16")];
+            tensor<int32, [4]> x1_115_begin_0 = const()[name = string("x1_115_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_115_end_0 = const()[name = string("x1_115_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_115_end_mask_0 = const()[name = string("x1_115_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_115_cast_fp16 = slice_by_index(begin = x1_115_begin_0, end = x1_115_end_0, end_mask = x1_115_end_mask_0, x = var_4573_cast_fp16_1)[name = string("x1_115_cast_fp16")];
+            tensor<int32, [4]> x2_115_begin_0 = const()[name = string("x2_115_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_115_end_0 = const()[name = string("x2_115_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_115_end_mask_0 = const()[name = string("x2_115_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_115_cast_fp16 = slice_by_index(begin = x2_115_begin_0, end = x2_115_end_0, end_mask = x2_115_end_mask_0, x = var_4573_cast_fp16_1)[name = string("x2_115_cast_fp16")];
+            fp16 const_353_promoted_to_fp16 = const()[name = string("const_353_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4615_cast_fp16 = mul(x = x2_115_cast_fp16, y = const_353_promoted_to_fp16)[name = string("op_4615_cast_fp16")];
+            bool var_4617_interleave_0 = const()[name = string("op_4617_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4617_cast_fp16 = concat(axis = var_38, interleave = var_4617_interleave_0, values = (var_4615_cast_fp16, x1_115_cast_fp16))[name = string("op_4617_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4618_cast_fp16 = mul(x = var_4617_cast_fp16, y = sin_233)[name = string("op_4618_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4619_cast_fp16 = add(x = var_4604_cast_fp16, y = var_4618_cast_fp16)[name = string("op_4619_cast_fp16")];
+            bool query_states_29_interleave_0 = const()[name = string("query_states_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_29_cast_fp16 = concat(axis = var_38, interleave = query_states_29_interleave_0, values = (var_4601_cast_fp16, var_4619_cast_fp16))[name = string("query_states_29_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_14_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(301708096)))];
+            tensor<fp16, [1, 2304, 768]> linear_100_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_14_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_196_cast_fp16)[name = string("linear_100_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.7p+4)];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.6ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_199_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_100_cast_fp16)[name = string("clip_199_cast_fp16")];
+            tensor<int32, [4]> var_4632 = const()[name = string("op_4632"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_829_cast_fp16 = reshape(shape = var_4632, x = clip_199_cast_fp16)[name = string("hidden_states_829_cast_fp16")];
+            fp16 var_33_promoted_100_to_fp16 = const()[name = string("op_33_promoted_100_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4636_cast_fp16 = pow(x = hidden_states_829_cast_fp16, y = var_33_promoted_100_to_fp16)[name = string("op_4636_cast_fp16")];
+            tensor<int32, [1]> var_4638_axes_0 = const()[name = string("op_4638_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4638_keep_dims_0 = const()[name = string("op_4638_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4638_cast_fp16 = reduce_mean(axes = var_4638_axes_0, keep_dims = var_4638_keep_dims_0, x = var_4636_cast_fp16)[name = string("op_4638_cast_fp16")];
+            fp16 var_4639_to_fp16 = const()[name = string("op_4639_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_201_cast_fp16 = add(x = var_4638_cast_fp16, y = var_4639_to_fp16)[name = string("mean_squared_201_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4641_cast_fp16 = pow(x = mean_squared_201_cast_fp16, y = var_27_to_fp16)[name = string("op_4641_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_373_cast_fp16 = mul(x = hidden_states_829_cast_fp16, y = var_4641_cast_fp16)[name = string("normed_output_373_cast_fp16")];
+            tensor<fp16, [64]> const_354_to_fp16 = const()[name = string("const_354_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302887808)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_375_cast_fp16 = mul(x = normed_output_373_cast_fp16, y = const_354_to_fp16)[name = string("normed_output_375_cast_fp16")];
+            tensor<int32, [2]> var_4661 = const()[name = string("op_4661"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4662_axis_0 = const()[name = string("op_4662_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4662_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4662_cast_fp16_1 = split(axis = var_4662_axis_0, split_sizes = var_4661, x = normed_output_375_cast_fp16)[name = string("op_4662_cast_fp16")];
+            tensor<int32, [2]> var_4665 = const()[name = string("op_4665"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4666_axis_0 = const()[name = string("op_4666_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4666_0, tensor<fp16, [1, 2304, 32]> var_4666_1 = split(axis = var_4666_axis_0, split_sizes = var_4665, x = var_160_cast_fp16)[name = string("op_4666")];
+            tensor<int32, [2]> var_4669 = const()[name = string("op_4669"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4670_axis_0 = const()[name = string("op_4670_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4670_0, tensor<fp16, [1, 2304, 32]> var_4670_1 = split(axis = var_4670_axis_0, split_sizes = var_4669, x = var_163_cast_fp16)[name = string("op_4670")];
+            tensor<int32, [1]> cos_237_axes_0 = const()[name = string("cos_237_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_237 = expand_dims(axes = cos_237_axes_0, x = var_4666_0)[name = string("cos_237")];
+            tensor<int32, [1]> sin_237_axes_0 = const()[name = string("sin_237_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_237 = expand_dims(axes = sin_237_axes_0, x = var_4670_0)[name = string("sin_237")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4675_cast_fp16 = mul(x = var_4662_cast_fp16_0, y = cos_237)[name = string("op_4675_cast_fp16")];
+            tensor<int32, [4]> x1_117_begin_0 = const()[name = string("x1_117_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_117_end_0 = const()[name = string("x1_117_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_117_end_mask_0 = const()[name = string("x1_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_117_cast_fp16 = slice_by_index(begin = x1_117_begin_0, end = x1_117_end_0, end_mask = x1_117_end_mask_0, x = var_4662_cast_fp16_0)[name = string("x1_117_cast_fp16")];
+            tensor<int32, [4]> x2_117_begin_0 = const()[name = string("x2_117_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_117_end_0 = const()[name = string("x2_117_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_117_end_mask_0 = const()[name = string("x2_117_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_117_cast_fp16 = slice_by_index(begin = x2_117_begin_0, end = x2_117_end_0, end_mask = x2_117_end_mask_0, x = var_4662_cast_fp16_0)[name = string("x2_117_cast_fp16")];
+            fp16 const_359_promoted_to_fp16 = const()[name = string("const_359_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4686_cast_fp16 = mul(x = x2_117_cast_fp16, y = const_359_promoted_to_fp16)[name = string("op_4686_cast_fp16")];
+            bool var_4688_interleave_0 = const()[name = string("op_4688_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4688_cast_fp16 = concat(axis = var_38, interleave = var_4688_interleave_0, values = (var_4686_cast_fp16, x1_117_cast_fp16))[name = string("op_4688_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4689_cast_fp16 = mul(x = var_4688_cast_fp16, y = sin_237)[name = string("op_4689_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4690_cast_fp16 = add(x = var_4675_cast_fp16, y = var_4689_cast_fp16)[name = string("op_4690_cast_fp16")];
+            tensor<int32, [1]> cos_241_axes_0 = const()[name = string("cos_241_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_241 = expand_dims(axes = cos_241_axes_0, x = var_4666_1)[name = string("cos_241")];
+            tensor<int32, [1]> sin_241_axes_0 = const()[name = string("sin_241_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_241 = expand_dims(axes = sin_241_axes_0, x = var_4670_1)[name = string("sin_241")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4693_cast_fp16 = mul(x = var_4662_cast_fp16_1, y = cos_241)[name = string("op_4693_cast_fp16")];
+            tensor<int32, [4]> x1_119_begin_0 = const()[name = string("x1_119_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_119_end_0 = const()[name = string("x1_119_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_119_end_mask_0 = const()[name = string("x1_119_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_119_cast_fp16 = slice_by_index(begin = x1_119_begin_0, end = x1_119_end_0, end_mask = x1_119_end_mask_0, x = var_4662_cast_fp16_1)[name = string("x1_119_cast_fp16")];
+            tensor<int32, [4]> x2_119_begin_0 = const()[name = string("x2_119_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_119_end_0 = const()[name = string("x2_119_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_119_end_mask_0 = const()[name = string("x2_119_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_119_cast_fp16 = slice_by_index(begin = x2_119_begin_0, end = x2_119_end_0, end_mask = x2_119_end_mask_0, x = var_4662_cast_fp16_1)[name = string("x2_119_cast_fp16")];
+            fp16 const_362_promoted_to_fp16 = const()[name = string("const_362_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4704_cast_fp16 = mul(x = x2_119_cast_fp16, y = const_362_promoted_to_fp16)[name = string("op_4704_cast_fp16")];
+            bool var_4706_interleave_0 = const()[name = string("op_4706_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4706_cast_fp16 = concat(axis = var_38, interleave = var_4706_interleave_0, values = (var_4704_cast_fp16, x1_119_cast_fp16))[name = string("op_4706_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4707_cast_fp16 = mul(x = var_4706_cast_fp16, y = sin_241)[name = string("op_4707_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4708_cast_fp16 = add(x = var_4693_cast_fp16, y = var_4707_cast_fp16)[name = string("op_4708_cast_fp16")];
+            bool key_states_29_interleave_0 = const()[name = string("key_states_29_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_29_cast_fp16 = concat(axis = var_38, interleave = key_states_29_interleave_0, values = (var_4690_cast_fp16, var_4708_cast_fp16))[name = string("key_states_29_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_14_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(302888000)))];
+            tensor<fp16, [1, 2304, 768]> linear_101_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_14_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_196_cast_fp16)[name = string("linear_101_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.7p+4)];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.6ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_201_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_101_cast_fp16)[name = string("clip_201_cast_fp16")];
+            tensor<int32, [4]> var_4721 = const()[name = string("op_4721"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_835_cast_fp16 = reshape(shape = var_4721, x = clip_201_cast_fp16)[name = string("hidden_states_835_cast_fp16")];
+            fp16 var_33_promoted_101_to_fp16 = const()[name = string("op_33_promoted_101_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4724_cast_fp16 = pow(x = hidden_states_835_cast_fp16, y = var_33_promoted_101_to_fp16)[name = string("op_4724_cast_fp16")];
+            tensor<int32, [1]> var_4726_axes_0 = const()[name = string("op_4726_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4726_keep_dims_0 = const()[name = string("op_4726_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4726_cast_fp16 = reduce_mean(axes = var_4726_axes_0, keep_dims = var_4726_keep_dims_0, x = var_4724_cast_fp16)[name = string("op_4726_cast_fp16")];
+            fp16 var_4727_to_fp16 = const()[name = string("op_4727_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_203_cast_fp16 = add(x = var_4726_cast_fp16, y = var_4727_to_fp16)[name = string("mean_squared_203_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4729_cast_fp16 = pow(x = mean_squared_203_cast_fp16, y = var_27_to_fp16)[name = string("op_4729_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_377_cast_fp16 = mul(x = hidden_states_835_cast_fp16, y = var_4729_cast_fp16)[name = string("normed_output_377_cast_fp16")];
+            tensor<int32, [4]> hidden_states_841_perm_0 = const()[name = string("hidden_states_841_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_14_transpose_y_0 = const()[name = string("matmul_14_transpose_y_0"), val = bool(true)];
+            bool matmul_14_transpose_x_0 = const()[name = string("matmul_14_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_92_perm_0 = const()[name = string("transpose_92_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_93_perm_0 = const()[name = string("transpose_93_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_93 = transpose(perm = transpose_93_perm_0, x = key_states_29_cast_fp16)[name = string("transpose_101")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_92 = transpose(perm = transpose_92_perm_0, x = query_states_29_cast_fp16)[name = string("transpose_102")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_14_cast_fp16 = matmul(transpose_x = matmul_14_transpose_x_0, transpose_y = matmul_14_transpose_y_0, x = transpose_92, y = transpose_93)[name = string("matmul_14_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_14_cast_fp16 = add(x = matmul_14_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_14_cast_fp16")];
+            int32 softmax_14_axis_0 = const()[name = string("softmax_14_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_14_cast_fp16 = softmax(axis = softmax_14_axis_0, x = add_14_cast_fp16)[name = string("softmax_14_cast_fp16")];
+            bool attn_output_57_transpose_x_0 = const()[name = string("attn_output_57_transpose_x_0"), val = bool(false)];
+            bool attn_output_57_transpose_y_0 = const()[name = string("attn_output_57_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_841_cast_fp16 = transpose(perm = hidden_states_841_perm_0, x = normed_output_377_cast_fp16)[name = string("transpose_103")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_57_cast_fp16 = matmul(transpose_x = attn_output_57_transpose_x_0, transpose_y = attn_output_57_transpose_y_0, x = softmax_14_cast_fp16, y = hidden_states_841_cast_fp16)[name = string("attn_output_57_cast_fp16")];
+            tensor<int32, [4]> var_4734_perm_0 = const()[name = string("op_4734_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_4736 = const()[name = string("op_4736"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_4734_cast_fp16 = transpose(perm = var_4734_perm_0, x = attn_output_57_cast_fp16)[name = string("transpose_100")];
+            tensor<fp16, [1, 2304, 768]> var_4737_cast_fp16 = reshape(shape = var_4736, x = var_4734_cast_fp16)[name = string("op_4737_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.2cp+1)];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.2ap+1)];
+            tensor<fp16, [1, 2304, 768]> clip_202_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_self_attn_o_proj_input_max_promoted_to_fp16, x = var_4737_cast_fp16)[name = string("clip_202_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_14_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(304067712)))];
+            tensor<fp16, [1, 2304, 768]> linear_102_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_14_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_202_cast_fp16)[name = string("linear_102_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.74p+1)];
+            fp16 model_vision_tower_encoder_layers_14_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.7p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_203_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_102_cast_fp16)[name = string("clip_203_cast_fp16")];
+            fp16 var_33_promoted_102_to_fp16 = const()[name = string("op_33_promoted_102_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4750_cast_fp16 = pow(x = clip_203_cast_fp16, y = var_33_promoted_102_to_fp16)[name = string("op_4750_cast_fp16")];
+            tensor<int32, [1]> var_4752_axes_0 = const()[name = string("op_4752_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4752_keep_dims_0 = const()[name = string("op_4752_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4752_cast_fp16 = reduce_mean(axes = var_4752_axes_0, keep_dims = var_4752_keep_dims_0, x = var_4750_cast_fp16)[name = string("op_4752_cast_fp16")];
+            fp16 var_4753_to_fp16 = const()[name = string("op_4753_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_205_cast_fp16 = add(x = var_4752_cast_fp16, y = var_4753_to_fp16)[name = string("mean_squared_205_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4755_cast_fp16 = pow(x = mean_squared_205_cast_fp16, y = var_27_to_fp16)[name = string("op_4755_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_379_cast_fp16 = mul(x = clip_203_cast_fp16, y = var_4755_cast_fp16)[name = string("normed_output_379_cast_fp16")];
+            tensor<fp16, [768]> const_363_to_fp16 = const()[name = string("const_363_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(305247424)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_381_cast_fp16 = mul(x = normed_output_379_cast_fp16, y = const_363_to_fp16)[name = string("normed_output_381_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_853_cast_fp16 = add(x = hidden_states_815_cast_fp16, y = normed_output_381_cast_fp16)[name = string("hidden_states_853_cast_fp16")];
+            fp16 var_33_promoted_103_to_fp16 = const()[name = string("op_33_promoted_103_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4763_cast_fp16 = pow(x = hidden_states_853_cast_fp16, y = var_33_promoted_103_to_fp16)[name = string("op_4763_cast_fp16")];
+            tensor<int32, [1]> var_4765_axes_0 = const()[name = string("op_4765_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4765_keep_dims_0 = const()[name = string("op_4765_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4765_cast_fp16 = reduce_mean(axes = var_4765_axes_0, keep_dims = var_4765_keep_dims_0, x = var_4763_cast_fp16)[name = string("op_4765_cast_fp16")];
+            fp16 var_4766_to_fp16 = const()[name = string("op_4766_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_207_cast_fp16 = add(x = var_4765_cast_fp16, y = var_4766_to_fp16)[name = string("mean_squared_207_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4768_cast_fp16 = pow(x = mean_squared_207_cast_fp16, y = var_27_to_fp16)[name = string("op_4768_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_383_cast_fp16 = mul(x = hidden_states_853_cast_fp16, y = var_4768_cast_fp16)[name = string("normed_output_383_cast_fp16")];
+            tensor<fp16, [768]> const_364_to_fp16 = const()[name = string("const_364_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(305249024)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_385_cast_fp16 = mul(x = normed_output_383_cast_fp16, y = const_364_to_fp16)[name = string("normed_output_385_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.1ap+3)];
+            fp16 model_vision_tower_encoder_layers_14_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.18p+3)];
+            tensor<fp16, [1, 2304, 768]> clip_204_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_385_cast_fp16)[name = string("clip_204_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_14_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(305250624)))];
+            tensor<fp16, [1, 2304, 3072]> linear_103_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_14_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_204_cast_fp16)[name = string("linear_103_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.28p+3)];
+            fp16 model_vision_tower_encoder_layers_14_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.26p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_205_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_103_cast_fp16)[name = string("clip_205_cast_fp16")];
+            string var_4785_mode_0 = const()[name = string("op_4785_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_4785_cast_fp16 = gelu(mode = var_4785_mode_0, x = clip_205_cast_fp16)[name = string("op_4785_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_14_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(309969280)))];
+            tensor<fp16, [1, 2304, 3072]> linear_104_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_14_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_204_cast_fp16)[name = string("linear_104_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.28p+3)];
+            fp16 model_vision_tower_encoder_layers_14_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.26p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_207_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_mlp_up_proj_output_max_promoted_to_fp16, x = linear_104_cast_fp16)[name = string("clip_207_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_863_cast_fp16 = mul(x = var_4785_cast_fp16, y = clip_207_cast_fp16)[name = string("hidden_states_863_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.bp+3)];
+            fp16 model_vision_tower_encoder_layers_14_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.aep+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_208_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_863_cast_fp16)[name = string("clip_208_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_14_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(314687936)))];
+            tensor<fp16, [1, 2304, 768]> linear_105_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_14_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_208_cast_fp16)[name = string("linear_105_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_14_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.3p+2)];
+            fp16 model_vision_tower_encoder_layers_14_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_14_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.2ep+2)];
+            tensor<fp16, [1, 2304, 768]> clip_209_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_14_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_14_mlp_down_proj_output_max_promoted_to_fp16, x = linear_105_cast_fp16)[name = string("clip_209_cast_fp16")];
+            fp16 var_33_promoted_104_to_fp16 = const()[name = string("op_33_promoted_104_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4807_cast_fp16 = pow(x = clip_209_cast_fp16, y = var_33_promoted_104_to_fp16)[name = string("op_4807_cast_fp16")];
+            tensor<int32, [1]> var_4809_axes_0 = const()[name = string("op_4809_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4809_keep_dims_0 = const()[name = string("op_4809_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4809_cast_fp16 = reduce_mean(axes = var_4809_axes_0, keep_dims = var_4809_keep_dims_0, x = var_4807_cast_fp16)[name = string("op_4809_cast_fp16")];
+            fp16 var_4810_to_fp16 = const()[name = string("op_4810_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_209_cast_fp16 = add(x = var_4809_cast_fp16, y = var_4810_to_fp16)[name = string("mean_squared_209_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4812_cast_fp16 = pow(x = mean_squared_209_cast_fp16, y = var_27_to_fp16)[name = string("op_4812_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_387_cast_fp16 = mul(x = clip_209_cast_fp16, y = var_4812_cast_fp16)[name = string("normed_output_387_cast_fp16")];
+            tensor<fp16, [768]> const_365_to_fp16 = const()[name = string("const_365_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319406592)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_389_cast_fp16 = mul(x = normed_output_387_cast_fp16, y = const_365_to_fp16)[name = string("normed_output_389_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_873_cast_fp16 = add(x = hidden_states_853_cast_fp16, y = normed_output_389_cast_fp16)[name = string("hidden_states_873_cast_fp16")];
+            fp16 var_33_promoted_105_to_fp16 = const()[name = string("op_33_promoted_105_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_4826_cast_fp16 = pow(x = hidden_states_873_cast_fp16, y = var_33_promoted_105_to_fp16)[name = string("op_4826_cast_fp16")];
+            tensor<int32, [1]> var_4828_axes_0 = const()[name = string("op_4828_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4828_keep_dims_0 = const()[name = string("op_4828_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_4828_cast_fp16 = reduce_mean(axes = var_4828_axes_0, keep_dims = var_4828_keep_dims_0, x = var_4826_cast_fp16)[name = string("op_4828_cast_fp16")];
+            fp16 var_4829_to_fp16 = const()[name = string("op_4829_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_211_cast_fp16 = add(x = var_4828_cast_fp16, y = var_4829_to_fp16)[name = string("mean_squared_211_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_4831_cast_fp16 = pow(x = mean_squared_211_cast_fp16, y = var_27_to_fp16)[name = string("op_4831_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_391_cast_fp16 = mul(x = hidden_states_873_cast_fp16, y = var_4831_cast_fp16)[name = string("normed_output_391_cast_fp16")];
+            tensor<fp16, [768]> const_366_to_fp16 = const()[name = string("const_366_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319408192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_393_cast_fp16 = mul(x = normed_output_391_cast_fp16, y = const_366_to_fp16)[name = string("normed_output_393_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_q_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_q_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.4cp+4)];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_q_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_q_proj_input_max_promoted_to_fp16"), val = fp16(0x1.4ap+4)];
+            tensor<fp16, [1, 2304, 768]> clip_210_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_self_attn_q_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_self_attn_q_proj_input_max_promoted_to_fp16, x = normed_output_393_cast_fp16)[name = string("clip_210_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_15_self_attn_q_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_q_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(319409792)))];
+            tensor<fp16, [1, 2304, 768]> linear_106_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_15_self_attn_q_proj_linear_weight_promoted_to_fp16, x = clip_210_cast_fp16)[name = string("linear_106_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_q_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_q_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.6p+4)];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_q_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_q_proj_output_max_promoted_to_fp16"), val = fp16(0x1.5ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_211_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_self_attn_q_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_self_attn_q_proj_output_max_promoted_to_fp16, x = linear_106_cast_fp16)[name = string("clip_211_cast_fp16")];
+            tensor<int32, [4]> var_4853 = const()[name = string("op_4853"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_881_cast_fp16 = reshape(shape = var_4853, x = clip_211_cast_fp16)[name = string("hidden_states_881_cast_fp16")];
+            fp16 var_33_promoted_106_to_fp16 = const()[name = string("op_33_promoted_106_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4857_cast_fp16 = pow(x = hidden_states_881_cast_fp16, y = var_33_promoted_106_to_fp16)[name = string("op_4857_cast_fp16")];
+            tensor<int32, [1]> var_4859_axes_0 = const()[name = string("op_4859_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4859_keep_dims_0 = const()[name = string("op_4859_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4859_cast_fp16 = reduce_mean(axes = var_4859_axes_0, keep_dims = var_4859_keep_dims_0, x = var_4857_cast_fp16)[name = string("op_4859_cast_fp16")];
+            fp16 var_4860_to_fp16 = const()[name = string("op_4860_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_213_cast_fp16 = add(x = var_4859_cast_fp16, y = var_4860_to_fp16)[name = string("mean_squared_213_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4862_cast_fp16 = pow(x = mean_squared_213_cast_fp16, y = var_27_to_fp16)[name = string("op_4862_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_395_cast_fp16 = mul(x = hidden_states_881_cast_fp16, y = var_4862_cast_fp16)[name = string("normed_output_395_cast_fp16")];
+            tensor<fp16, [64]> const_369_to_fp16 = const()[name = string("const_369_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(320589504)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_397_cast_fp16 = mul(x = normed_output_395_cast_fp16, y = const_369_to_fp16)[name = string("normed_output_397_cast_fp16")];
+            tensor<int32, [2]> var_4882 = const()[name = string("op_4882"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4883_axis_0 = const()[name = string("op_4883_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4883_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4883_cast_fp16_1 = split(axis = var_4883_axis_0, split_sizes = var_4882, x = normed_output_397_cast_fp16)[name = string("op_4883_cast_fp16")];
+            tensor<int32, [2]> var_4886 = const()[name = string("op_4886"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4887_axis_0 = const()[name = string("op_4887_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4887_0, tensor<fp16, [1, 2304, 32]> var_4887_1 = split(axis = var_4887_axis_0, split_sizes = var_4886, x = var_160_cast_fp16)[name = string("op_4887")];
+            tensor<int32, [2]> var_4890 = const()[name = string("op_4890"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4891_axis_0 = const()[name = string("op_4891_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4891_0, tensor<fp16, [1, 2304, 32]> var_4891_1 = split(axis = var_4891_axis_0, split_sizes = var_4890, x = var_163_cast_fp16)[name = string("op_4891")];
+            tensor<int32, [1]> cos_245_axes_0 = const()[name = string("cos_245_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_245 = expand_dims(axes = cos_245_axes_0, x = var_4887_0)[name = string("cos_245")];
+            tensor<int32, [1]> sin_245_axes_0 = const()[name = string("sin_245_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_245 = expand_dims(axes = sin_245_axes_0, x = var_4891_0)[name = string("sin_245")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4896_cast_fp16 = mul(x = var_4883_cast_fp16_0, y = cos_245)[name = string("op_4896_cast_fp16")];
+            tensor<int32, [4]> x1_121_begin_0 = const()[name = string("x1_121_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_121_end_0 = const()[name = string("x1_121_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_121_end_mask_0 = const()[name = string("x1_121_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_121_cast_fp16 = slice_by_index(begin = x1_121_begin_0, end = x1_121_end_0, end_mask = x1_121_end_mask_0, x = var_4883_cast_fp16_0)[name = string("x1_121_cast_fp16")];
+            tensor<int32, [4]> x2_121_begin_0 = const()[name = string("x2_121_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_121_end_0 = const()[name = string("x2_121_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_121_end_mask_0 = const()[name = string("x2_121_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_121_cast_fp16 = slice_by_index(begin = x2_121_begin_0, end = x2_121_end_0, end_mask = x2_121_end_mask_0, x = var_4883_cast_fp16_0)[name = string("x2_121_cast_fp16")];
+            fp16 const_374_promoted_to_fp16 = const()[name = string("const_374_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4907_cast_fp16 = mul(x = x2_121_cast_fp16, y = const_374_promoted_to_fp16)[name = string("op_4907_cast_fp16")];
+            bool var_4909_interleave_0 = const()[name = string("op_4909_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4909_cast_fp16 = concat(axis = var_38, interleave = var_4909_interleave_0, values = (var_4907_cast_fp16, x1_121_cast_fp16))[name = string("op_4909_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4910_cast_fp16 = mul(x = var_4909_cast_fp16, y = sin_245)[name = string("op_4910_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4911_cast_fp16 = add(x = var_4896_cast_fp16, y = var_4910_cast_fp16)[name = string("op_4911_cast_fp16")];
+            tensor<int32, [1]> cos_249_axes_0 = const()[name = string("cos_249_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_249 = expand_dims(axes = cos_249_axes_0, x = var_4887_1)[name = string("cos_249")];
+            tensor<int32, [1]> sin_249_axes_0 = const()[name = string("sin_249_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_249 = expand_dims(axes = sin_249_axes_0, x = var_4891_1)[name = string("sin_249")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4914_cast_fp16 = mul(x = var_4883_cast_fp16_1, y = cos_249)[name = string("op_4914_cast_fp16")];
+            tensor<int32, [4]> x1_123_begin_0 = const()[name = string("x1_123_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_123_end_0 = const()[name = string("x1_123_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_123_end_mask_0 = const()[name = string("x1_123_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_123_cast_fp16 = slice_by_index(begin = x1_123_begin_0, end = x1_123_end_0, end_mask = x1_123_end_mask_0, x = var_4883_cast_fp16_1)[name = string("x1_123_cast_fp16")];
+            tensor<int32, [4]> x2_123_begin_0 = const()[name = string("x2_123_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_123_end_0 = const()[name = string("x2_123_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_123_end_mask_0 = const()[name = string("x2_123_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_123_cast_fp16 = slice_by_index(begin = x2_123_begin_0, end = x2_123_end_0, end_mask = x2_123_end_mask_0, x = var_4883_cast_fp16_1)[name = string("x2_123_cast_fp16")];
+            fp16 const_377_promoted_to_fp16 = const()[name = string("const_377_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4925_cast_fp16 = mul(x = x2_123_cast_fp16, y = const_377_promoted_to_fp16)[name = string("op_4925_cast_fp16")];
+            bool var_4927_interleave_0 = const()[name = string("op_4927_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4927_cast_fp16 = concat(axis = var_38, interleave = var_4927_interleave_0, values = (var_4925_cast_fp16, x1_123_cast_fp16))[name = string("op_4927_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4928_cast_fp16 = mul(x = var_4927_cast_fp16, y = sin_249)[name = string("op_4928_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4929_cast_fp16 = add(x = var_4914_cast_fp16, y = var_4928_cast_fp16)[name = string("op_4929_cast_fp16")];
+            bool query_states_interleave_0 = const()[name = string("query_states_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> query_states_cast_fp16 = concat(axis = var_38, interleave = query_states_interleave_0, values = (var_4911_cast_fp16, var_4929_cast_fp16))[name = string("query_states_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_15_self_attn_k_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_k_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(320589696)))];
+            tensor<fp16, [1, 2304, 768]> linear_107_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_15_self_attn_k_proj_linear_weight_promoted_to_fp16, x = clip_210_cast_fp16)[name = string("linear_107_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_k_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_k_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.9p+4)];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_k_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_k_proj_output_max_promoted_to_fp16"), val = fp16(0x1.8ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_213_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_self_attn_k_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_self_attn_k_proj_output_max_promoted_to_fp16, x = linear_107_cast_fp16)[name = string("clip_213_cast_fp16")];
+            tensor<int32, [4]> var_4942 = const()[name = string("op_4942"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_887_cast_fp16 = reshape(shape = var_4942, x = clip_213_cast_fp16)[name = string("hidden_states_887_cast_fp16")];
+            fp16 var_33_promoted_107_to_fp16 = const()[name = string("op_33_promoted_107_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_4946_cast_fp16 = pow(x = hidden_states_887_cast_fp16, y = var_33_promoted_107_to_fp16)[name = string("op_4946_cast_fp16")];
+            tensor<int32, [1]> var_4948_axes_0 = const()[name = string("op_4948_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_4948_keep_dims_0 = const()[name = string("op_4948_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_4948_cast_fp16 = reduce_mean(axes = var_4948_axes_0, keep_dims = var_4948_keep_dims_0, x = var_4946_cast_fp16)[name = string("op_4948_cast_fp16")];
+            fp16 var_4949_to_fp16 = const()[name = string("op_4949_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_215_cast_fp16 = add(x = var_4948_cast_fp16, y = var_4949_to_fp16)[name = string("mean_squared_215_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_4951_cast_fp16 = pow(x = mean_squared_215_cast_fp16, y = var_27_to_fp16)[name = string("op_4951_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_399_cast_fp16 = mul(x = hidden_states_887_cast_fp16, y = var_4951_cast_fp16)[name = string("normed_output_399_cast_fp16")];
+            tensor<fp16, [64]> const_378_to_fp16 = const()[name = string("const_378_to_fp16"), val = tensor<fp16, [64]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(321769408)))];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_401_cast_fp16 = mul(x = normed_output_399_cast_fp16, y = const_378_to_fp16)[name = string("normed_output_401_cast_fp16")];
+            tensor<int32, [2]> var_4971 = const()[name = string("op_4971"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4972_axis_0 = const()[name = string("op_4972_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4972_cast_fp16_0, tensor<fp16, [1, 2304, 12, 32]> var_4972_cast_fp16_1 = split(axis = var_4972_axis_0, split_sizes = var_4971, x = normed_output_401_cast_fp16)[name = string("op_4972_cast_fp16")];
+            tensor<int32, [2]> var_4975 = const()[name = string("op_4975"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4976_axis_0 = const()[name = string("op_4976_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4976_0, tensor<fp16, [1, 2304, 32]> var_4976_1 = split(axis = var_4976_axis_0, split_sizes = var_4975, x = var_160_cast_fp16)[name = string("op_4976")];
+            tensor<int32, [2]> var_4979 = const()[name = string("op_4979"), val = tensor<int32, [2]>([32, 32])];
+            int32 var_4980_axis_0 = const()[name = string("op_4980_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 2304, 32]> var_4980_0, tensor<fp16, [1, 2304, 32]> var_4980_1 = split(axis = var_4980_axis_0, split_sizes = var_4979, x = var_163_cast_fp16)[name = string("op_4980")];
+            tensor<int32, [1]> cos_253_axes_0 = const()[name = string("cos_253_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos_253 = expand_dims(axes = cos_253_axes_0, x = var_4976_0)[name = string("cos_253")];
+            tensor<int32, [1]> sin_253_axes_0 = const()[name = string("sin_253_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin_253 = expand_dims(axes = sin_253_axes_0, x = var_4980_0)[name = string("sin_253")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4985_cast_fp16 = mul(x = var_4972_cast_fp16_0, y = cos_253)[name = string("op_4985_cast_fp16")];
+            tensor<int32, [4]> x1_125_begin_0 = const()[name = string("x1_125_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_125_end_0 = const()[name = string("x1_125_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_125_end_mask_0 = const()[name = string("x1_125_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_125_cast_fp16 = slice_by_index(begin = x1_125_begin_0, end = x1_125_end_0, end_mask = x1_125_end_mask_0, x = var_4972_cast_fp16_0)[name = string("x1_125_cast_fp16")];
+            tensor<int32, [4]> x2_125_begin_0 = const()[name = string("x2_125_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_125_end_0 = const()[name = string("x2_125_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_125_end_mask_0 = const()[name = string("x2_125_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_125_cast_fp16 = slice_by_index(begin = x2_125_begin_0, end = x2_125_end_0, end_mask = x2_125_end_mask_0, x = var_4972_cast_fp16_0)[name = string("x2_125_cast_fp16")];
+            fp16 const_383_promoted_to_fp16 = const()[name = string("const_383_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_4996_cast_fp16 = mul(x = x2_125_cast_fp16, y = const_383_promoted_to_fp16)[name = string("op_4996_cast_fp16")];
+            bool var_4998_interleave_0 = const()[name = string("op_4998_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_4998_cast_fp16 = concat(axis = var_38, interleave = var_4998_interleave_0, values = (var_4996_cast_fp16, x1_125_cast_fp16))[name = string("op_4998_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_4999_cast_fp16 = mul(x = var_4998_cast_fp16, y = sin_253)[name = string("op_4999_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_5000_cast_fp16 = add(x = var_4985_cast_fp16, y = var_4999_cast_fp16)[name = string("op_5000_cast_fp16")];
+            tensor<int32, [1]> cos_axes_0 = const()[name = string("cos_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> cos = expand_dims(axes = cos_axes_0, x = var_4976_1)[name = string("cos")];
+            tensor<int32, [1]> sin_axes_0 = const()[name = string("sin_axes_0"), val = tensor<int32, [1]>([2])];
+            tensor<fp16, [1, 2304, 1, 32]> sin = expand_dims(axes = sin_axes_0, x = var_4980_1)[name = string("sin")];
+            tensor<fp16, [1, 2304, 12, 32]> var_5003_cast_fp16 = mul(x = var_4972_cast_fp16_1, y = cos)[name = string("op_5003_cast_fp16")];
+            tensor<int32, [4]> x1_begin_0 = const()[name = string("x1_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
+            tensor<int32, [4]> x1_end_0 = const()[name = string("x1_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 16])];
+            tensor<bool, [4]> x1_end_mask_0 = const()[name = string("x1_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
+            tensor<fp16, [1, 2304, 12, 16]> x1_cast_fp16 = slice_by_index(begin = x1_begin_0, end = x1_end_0, end_mask = x1_end_mask_0, x = var_4972_cast_fp16_1)[name = string("x1_cast_fp16")];
+            tensor<int32, [4]> x2_begin_0 = const()[name = string("x2_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 16])];
+            tensor<int32, [4]> x2_end_0 = const()[name = string("x2_end_0"), val = tensor<int32, [4]>([1, 2304, 12, 32])];
+            tensor<bool, [4]> x2_end_mask_0 = const()[name = string("x2_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
+            tensor<fp16, [1, 2304, 12, 16]> x2_cast_fp16 = slice_by_index(begin = x2_begin_0, end = x2_end_0, end_mask = x2_end_mask_0, x = var_4972_cast_fp16_1)[name = string("x2_cast_fp16")];
+            fp16 const_386_promoted_to_fp16 = const()[name = string("const_386_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 2304, 12, 16]> var_5014_cast_fp16 = mul(x = x2_cast_fp16, y = const_386_promoted_to_fp16)[name = string("op_5014_cast_fp16")];
+            bool var_5016_interleave_0 = const()[name = string("op_5016_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 32]> var_5016_cast_fp16 = concat(axis = var_38, interleave = var_5016_interleave_0, values = (var_5014_cast_fp16, x1_cast_fp16))[name = string("op_5016_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_5017_cast_fp16 = mul(x = var_5016_cast_fp16, y = sin)[name = string("op_5017_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 32]> var_5018_cast_fp16 = add(x = var_5003_cast_fp16, y = var_5017_cast_fp16)[name = string("op_5018_cast_fp16")];
+            bool key_states_interleave_0 = const()[name = string("key_states_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 2304, 12, 64]> key_states_cast_fp16 = concat(axis = var_38, interleave = key_states_interleave_0, values = (var_5000_cast_fp16, var_5018_cast_fp16))[name = string("key_states_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_15_self_attn_v_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_v_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(321769600)))];
+            tensor<fp16, [1, 2304, 768]> linear_108_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_15_self_attn_v_proj_linear_weight_promoted_to_fp16, x = clip_210_cast_fp16)[name = string("linear_108_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_v_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_v_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.9p+4)];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_v_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_v_proj_output_max_promoted_to_fp16"), val = fp16(0x1.8ep+4)];
+            tensor<fp16, [1, 2304, 768]> clip_215_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_self_attn_v_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_self_attn_v_proj_output_max_promoted_to_fp16, x = linear_108_cast_fp16)[name = string("clip_215_cast_fp16")];
+            tensor<int32, [4]> var_5031 = const()[name = string("op_5031"), val = tensor<int32, [4]>([1, 2304, -1, 64])];
+            tensor<fp16, [1, 2304, 12, 64]> hidden_states_893_cast_fp16 = reshape(shape = var_5031, x = clip_215_cast_fp16)[name = string("hidden_states_893_cast_fp16")];
+            fp16 var_33_promoted_108_to_fp16 = const()[name = string("op_33_promoted_108_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 12, 64]> var_5034_cast_fp16 = pow(x = hidden_states_893_cast_fp16, y = var_33_promoted_108_to_fp16)[name = string("op_5034_cast_fp16")];
+            tensor<int32, [1]> var_5036_axes_0 = const()[name = string("op_5036_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5036_keep_dims_0 = const()[name = string("op_5036_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 12, 1]> var_5036_cast_fp16 = reduce_mean(axes = var_5036_axes_0, keep_dims = var_5036_keep_dims_0, x = var_5034_cast_fp16)[name = string("op_5036_cast_fp16")];
+            fp16 var_5037_to_fp16 = const()[name = string("op_5037_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 12, 1]> mean_squared_217_cast_fp16 = add(x = var_5036_cast_fp16, y = var_5037_to_fp16)[name = string("mean_squared_217_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 1]> var_5039_cast_fp16 = pow(x = mean_squared_217_cast_fp16, y = var_27_to_fp16)[name = string("op_5039_cast_fp16")];
+            tensor<fp16, [1, 2304, 12, 64]> normed_output_403_cast_fp16 = mul(x = hidden_states_893_cast_fp16, y = var_5039_cast_fp16)[name = string("normed_output_403_cast_fp16")];
+            tensor<int32, [4]> hidden_states_899_perm_0 = const()[name = string("hidden_states_899_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            bool matmul_15_transpose_y_0 = const()[name = string("matmul_15_transpose_y_0"), val = bool(true)];
+            bool matmul_15_transpose_x_0 = const()[name = string("matmul_15_transpose_x_0"), val = bool(false)];
+            tensor<int32, [4]> transpose_94_perm_0 = const()[name = string("transpose_94_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<int32, [4]> transpose_95_perm_0 = const()[name = string("transpose_95_perm_0"), val = tensor<int32, [4]>([0, 2, -3, -1])];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_95 = transpose(perm = transpose_95_perm_0, x = key_states_cast_fp16)[name = string("transpose_97")];
+            tensor<fp16, [1, 12, 2304, 64]> transpose_94 = transpose(perm = transpose_94_perm_0, x = query_states_cast_fp16)[name = string("transpose_98")];
+            tensor<fp16, [1, 12, 2304, 2304]> matmul_15_cast_fp16 = matmul(transpose_x = matmul_15_transpose_x_0, transpose_y = matmul_15_transpose_y_0, x = transpose_94, y = transpose_95)[name = string("matmul_15_cast_fp16")];
+            tensor<fp16, [1, 12, 2304, 2304]> add_15_cast_fp16 = add(x = matmul_15_cast_fp16, y = attention_mask_cast_fp16)[name = string("add_15_cast_fp16")];
+            int32 softmax_15_axis_0 = const()[name = string("softmax_15_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 12, 2304, 2304]> softmax_15_cast_fp16 = softmax(axis = softmax_15_axis_0, x = add_15_cast_fp16)[name = string("softmax_15_cast_fp16")];
+            bool attn_output_61_transpose_x_0 = const()[name = string("attn_output_61_transpose_x_0"), val = bool(false)];
+            bool attn_output_61_transpose_y_0 = const()[name = string("attn_output_61_transpose_y_0"), val = bool(false)];
+            tensor<fp16, [1, 12, 2304, 64]> hidden_states_899_cast_fp16 = transpose(perm = hidden_states_899_perm_0, x = normed_output_403_cast_fp16)[name = string("transpose_99")];
+            tensor<fp16, [1, 12, 2304, 64]> attn_output_61_cast_fp16 = matmul(transpose_x = attn_output_61_transpose_x_0, transpose_y = attn_output_61_transpose_y_0, x = softmax_15_cast_fp16, y = hidden_states_899_cast_fp16)[name = string("attn_output_61_cast_fp16")];
+            tensor<int32, [4]> var_5044_perm_0 = const()[name = string("op_5044_perm_0"), val = tensor<int32, [4]>([0, 2, 1, 3])];
+            tensor<int32, [3]> var_5046 = const()[name = string("op_5046"), val = tensor<int32, [3]>([1, 2304, -1])];
+            tensor<fp16, [1, 2304, 12, 64]> var_5044_cast_fp16 = transpose(perm = var_5044_perm_0, x = attn_output_61_cast_fp16)[name = string("transpose_96")];
+            tensor<fp16, [1, 2304, 768]> var_5047_cast_fp16 = reshape(shape = var_5046, x = var_5044_cast_fp16)[name = string("op_5047_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_o_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_o_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.22p+1)];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_o_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_o_proj_input_max_promoted_to_fp16"), val = fp16(0x1.2p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_216_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_self_attn_o_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_self_attn_o_proj_input_max_promoted_to_fp16, x = var_5047_cast_fp16)[name = string("clip_216_cast_fp16")];
+            tensor<fp16, [768, 768]> model_vision_tower_encoder_layers_15_self_attn_o_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_o_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(322949312)))];
+            tensor<fp16, [1, 2304, 768]> linear_109_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_15_self_attn_o_proj_linear_weight_promoted_to_fp16, x = clip_216_cast_fp16)[name = string("linear_109_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_o_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_o_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.d4p+1)];
+            fp16 model_vision_tower_encoder_layers_15_self_attn_o_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_self_attn_o_proj_output_max_promoted_to_fp16"), val = fp16(0x1.d2p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_217_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_self_attn_o_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_self_attn_o_proj_output_max_promoted_to_fp16, x = linear_109_cast_fp16)[name = string("clip_217_cast_fp16")];
+            fp16 var_33_promoted_109_to_fp16 = const()[name = string("op_33_promoted_109_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_5060_cast_fp16 = pow(x = clip_217_cast_fp16, y = var_33_promoted_109_to_fp16)[name = string("op_5060_cast_fp16")];
+            tensor<int32, [1]> var_5062_axes_0 = const()[name = string("op_5062_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5062_keep_dims_0 = const()[name = string("op_5062_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_5062_cast_fp16 = reduce_mean(axes = var_5062_axes_0, keep_dims = var_5062_keep_dims_0, x = var_5060_cast_fp16)[name = string("op_5062_cast_fp16")];
+            fp16 var_5063_to_fp16 = const()[name = string("op_5063_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_219_cast_fp16 = add(x = var_5062_cast_fp16, y = var_5063_to_fp16)[name = string("mean_squared_219_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_5065_cast_fp16 = pow(x = mean_squared_219_cast_fp16, y = var_27_to_fp16)[name = string("op_5065_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_405_cast_fp16 = mul(x = clip_217_cast_fp16, y = var_5065_cast_fp16)[name = string("normed_output_405_cast_fp16")];
+            tensor<fp16, [768]> const_387_to_fp16 = const()[name = string("const_387_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(324129024)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_407_cast_fp16 = mul(x = normed_output_405_cast_fp16, y = const_387_to_fp16)[name = string("normed_output_407_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_911_cast_fp16 = add(x = hidden_states_873_cast_fp16, y = normed_output_407_cast_fp16)[name = string("hidden_states_911_cast_fp16")];
+            fp16 var_33_promoted_110_to_fp16 = const()[name = string("op_33_promoted_110_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_5073_cast_fp16 = pow(x = hidden_states_911_cast_fp16, y = var_33_promoted_110_to_fp16)[name = string("op_5073_cast_fp16")];
+            tensor<int32, [1]> var_5075_axes_0 = const()[name = string("op_5075_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5075_keep_dims_0 = const()[name = string("op_5075_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_5075_cast_fp16 = reduce_mean(axes = var_5075_axes_0, keep_dims = var_5075_keep_dims_0, x = var_5073_cast_fp16)[name = string("op_5075_cast_fp16")];
+            fp16 var_5076_to_fp16 = const()[name = string("op_5076_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_221_cast_fp16 = add(x = var_5075_cast_fp16, y = var_5076_to_fp16)[name = string("mean_squared_221_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_5078_cast_fp16 = pow(x = mean_squared_221_cast_fp16, y = var_27_to_fp16)[name = string("op_5078_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_409_cast_fp16 = mul(x = hidden_states_911_cast_fp16, y = var_5078_cast_fp16)[name = string("normed_output_409_cast_fp16")];
+            tensor<fp16, [768]> const_388_to_fp16 = const()[name = string("const_388_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(324130624)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_411_cast_fp16 = mul(x = normed_output_409_cast_fp16, y = const_388_to_fp16)[name = string("normed_output_411_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_mlp_gate_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_gate_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.fap+2)];
+            fp16 model_vision_tower_encoder_layers_15_mlp_gate_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_gate_proj_input_max_promoted_to_fp16"), val = fp16(0x1.f6p+2)];
+            tensor<fp16, [1, 2304, 768]> clip_218_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_mlp_gate_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_mlp_gate_proj_input_max_promoted_to_fp16, x = normed_output_411_cast_fp16)[name = string("clip_218_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_15_mlp_gate_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_gate_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(324132224)))];
+            tensor<fp16, [1, 2304, 3072]> linear_110_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_15_mlp_gate_proj_linear_weight_promoted_to_fp16, x = clip_218_cast_fp16)[name = string("linear_110_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_mlp_gate_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_gate_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.18p+3)];
+            fp16 model_vision_tower_encoder_layers_15_mlp_gate_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_gate_proj_output_max_promoted_to_fp16"), val = fp16(0x1.16p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_219_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_mlp_gate_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_mlp_gate_proj_output_max_promoted_to_fp16, x = linear_110_cast_fp16)[name = string("clip_219_cast_fp16")];
+            string var_5095_mode_0 = const()[name = string("op_5095_mode_0"), val = string("TANH_APPROXIMATION")];
+            tensor<fp16, [1, 2304, 3072]> var_5095_cast_fp16 = gelu(mode = var_5095_mode_0, x = clip_219_cast_fp16)[name = string("op_5095_cast_fp16")];
+            tensor<fp16, [3072, 768]> model_vision_tower_encoder_layers_15_mlp_up_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_up_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [3072, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(328850880)))];
+            tensor<fp16, [1, 2304, 3072]> linear_111_cast_fp16 = linear(bias = linear_5_bias_0_to_fp16, weight = model_vision_tower_encoder_layers_15_mlp_up_proj_linear_weight_promoted_to_fp16, x = clip_218_cast_fp16)[name = string("linear_111_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_mlp_up_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_up_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.18p+3)];
+            fp16 model_vision_tower_encoder_layers_15_mlp_up_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_up_proj_output_max_promoted_to_fp16"), val = fp16(0x1.16p+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_221_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_mlp_up_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_mlp_up_proj_output_max_promoted_to_fp16, x = linear_111_cast_fp16)[name = string("clip_221_cast_fp16")];
+            tensor<fp16, [1, 2304, 3072]> hidden_states_921_cast_fp16 = mul(x = var_5095_cast_fp16, y = clip_221_cast_fp16)[name = string("hidden_states_921_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_mlp_down_proj_input_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_down_proj_input_min_promoted_to_fp16"), val = fp16(-0x1.82p+3)];
+            fp16 model_vision_tower_encoder_layers_15_mlp_down_proj_input_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_down_proj_input_max_promoted_to_fp16"), val = fp16(0x1.7ep+3)];
+            tensor<fp16, [1, 2304, 3072]> clip_222_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_mlp_down_proj_input_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_mlp_down_proj_input_max_promoted_to_fp16, x = hidden_states_921_cast_fp16)[name = string("clip_222_cast_fp16")];
+            tensor<fp16, [768, 3072]> model_vision_tower_encoder_layers_15_mlp_down_proj_linear_weight_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_down_proj_linear_weight_promoted_to_fp16"), val = tensor<fp16, [768, 3072]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(333569536)))];
+            tensor<fp16, [1, 2304, 768]> linear_112_cast_fp16 = linear(bias = linear_0_bias_0, weight = model_vision_tower_encoder_layers_15_mlp_down_proj_linear_weight_promoted_to_fp16, x = clip_222_cast_fp16)[name = string("linear_112_cast_fp16")];
+            fp16 model_vision_tower_encoder_layers_15_mlp_down_proj_output_min_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_down_proj_output_min_promoted_to_fp16"), val = fp16(-0x1.e8p+1)];
+            fp16 model_vision_tower_encoder_layers_15_mlp_down_proj_output_max_promoted_to_fp16 = const()[name = string("model_vision_tower_encoder_layers_15_mlp_down_proj_output_max_promoted_to_fp16"), val = fp16(0x1.e4p+1)];
+            tensor<fp16, [1, 2304, 768]> clip_223_cast_fp16 = clip(alpha = model_vision_tower_encoder_layers_15_mlp_down_proj_output_min_promoted_to_fp16, beta = model_vision_tower_encoder_layers_15_mlp_down_proj_output_max_promoted_to_fp16, x = linear_112_cast_fp16)[name = string("clip_223_cast_fp16")];
+            fp16 var_33_promoted_111_to_fp16 = const()[name = string("op_33_promoted_111_to_fp16"), val = fp16(0x1p+1)];
+            tensor<fp16, [1, 2304, 768]> var_5117_cast_fp16 = pow(x = clip_223_cast_fp16, y = var_33_promoted_111_to_fp16)[name = string("op_5117_cast_fp16")];
+            tensor<int32, [1]> var_5119_axes_0 = const()[name = string("op_5119_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool var_5119_keep_dims_0 = const()[name = string("op_5119_keep_dims_0"), val = bool(true)];
+            tensor<fp16, [1, 2304, 1]> var_5119_cast_fp16 = reduce_mean(axes = var_5119_axes_0, keep_dims = var_5119_keep_dims_0, x = var_5117_cast_fp16)[name = string("op_5119_cast_fp16")];
+            fp16 var_5120_to_fp16 = const()[name = string("op_5120_to_fp16"), val = fp16(0x1.1p-20)];
+            tensor<fp16, [1, 2304, 1]> mean_squared_cast_fp16 = add(x = var_5119_cast_fp16, y = var_5120_to_fp16)[name = string("mean_squared_cast_fp16")];
+            tensor<fp16, [1, 2304, 1]> var_5122_cast_fp16 = pow(x = mean_squared_cast_fp16, y = var_27_to_fp16)[name = string("op_5122_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> normed_output_413_cast_fp16 = mul(x = clip_223_cast_fp16, y = var_5122_cast_fp16)[name = string("normed_output_413_cast_fp16")];
+            tensor<fp16, [768]> const_389_to_fp16 = const()[name = string("const_389_to_fp16"), val = tensor<fp16, [768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(338288192)))];
+            tensor<fp16, [1, 2304, 768]> normed_output_cast_fp16 = mul(x = normed_output_413_cast_fp16, y = const_389_to_fp16)[name = string("normed_output_cast_fp16")];
+            tensor<fp16, [1, 2304, 768]> hidden_states_931_cast_fp16 = add(x = hidden_states_911_cast_fp16, y = normed_output_cast_fp16)[name = string("hidden_states_931_cast_fp16")];
+            fp16 var_36_to_fp16 = const()[name = string("op_36_to_fp16"), val = fp16(0x0p+0)];
+            tensor<fp16, [1, 2304, 768]> hidden_states_933_cast_fp16 = select(a = var_36_to_fp16, b = hidden_states_931_cast_fp16, cond = var_66)[name = string("hidden_states_933_cast_fp16")];
+            tensor<int32, [3]> var_5131_begin_0 = const()[name = string("op_5131_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_5131_end_0 = const()[name = string("op_5131_end_0"), val = tensor<int32, [3]>([1, 2304, 1])];
+            tensor<bool, [3]> var_5131_end_mask_0 = const()[name = string("op_5131_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<bool, [3]> var_5131_squeeze_mask_0 = const()[name = string("op_5131_squeeze_mask_0"), val = tensor<bool, [3]>([false, false, true])];
+            tensor<int32, [1, 2304]> var_5131 = slice_by_index(begin = var_5131_begin_0, end = var_5131_end_0, end_mask = var_5131_end_mask_0, squeeze_mask = var_5131_squeeze_mask_0, x = clamped_positions_1)[name = string("op_5131")];
+            tensor<int32, [1]> reduce_max_0_axes_0 = const()[name = string("reduce_max_0_axes_0"), val = tensor<int32, [1]>([-1])];
+            bool reduce_max_0_keep_dims_0 = const()[name = string("reduce_max_0_keep_dims_0"), val = bool(true)];
+            tensor<int32, [1, 1]> reduce_max_0 = reduce_max(axes = reduce_max_0_axes_0, keep_dims = reduce_max_0_keep_dims_0, x = var_5131)[name = string("reduce_max_0")];
+            int32 var_5134 = const()[name = string("op_5134"), val = int32(1)];
+            tensor<int32, [1, 1]> var_5135 = add(x = reduce_max_0, y = var_5134)[name = string("op_5135")];
+            string cast_66_to_fp16_dtype_0 = const()[name = string("cast_66_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 cast_67_to_fp16 = const()[name = string("cast_67_to_fp16"), val = fp16(0x1.8p+1)];
+            tensor<fp16, [1, 2304, 2]> clamped_positions_1_to_fp16 = cast(dtype = cast_66_to_fp16_dtype_0, x = clamped_positions_1)[name = string("cast_74")];
+            tensor<fp16, [1, 2304, 2]> kernel_idxs_1_cast_fp16 = floor_div(x = clamped_positions_1_to_fp16, y = cast_67_to_fp16)[name = string("kernel_idxs_1_cast_fp16")];
+            tensor<int32, [3]> var_5137_begin_0 = const()[name = string("op_5137_begin_0"), val = tensor<int32, [3]>([0, 0, 0])];
+            tensor<int32, [3]> var_5137_end_0 = const()[name = string("op_5137_end_0"), val = tensor<int32, [3]>([1, 2304, 1])];
+            tensor<bool, [3]> var_5137_end_mask_0 = const()[name = string("op_5137_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<bool, [3]> var_5137_squeeze_mask_0 = const()[name = string("op_5137_squeeze_mask_0"), val = tensor<bool, [3]>([false, false, true])];
+            tensor<fp16, [1, 2304]> var_5137_cast_fp16 = slice_by_index(begin = var_5137_begin_0, end = var_5137_end_0, end_mask = var_5137_end_mask_0, squeeze_mask = var_5137_squeeze_mask_0, x = kernel_idxs_1_cast_fp16)[name = string("op_5137_cast_fp16")];
+            tensor<int32, [1, 1]> floor_div_161 = floor_div(x = var_5135, y = var_17)[name = string("floor_div_161")];
+            tensor<int32, [3]> var_5139_begin_0 = const()[name = string("op_5139_begin_0"), val = tensor<int32, [3]>([0, 0, 1])];
+            tensor<int32, [3]> var_5139_end_0 = const()[name = string("op_5139_end_0"), val = tensor<int32, [3]>([1, 2304, 2])];
+            tensor<bool, [3]> var_5139_end_mask_0 = const()[name = string("op_5139_end_mask_0"), val = tensor<bool, [3]>([true, true, false])];
+            tensor<bool, [3]> var_5139_squeeze_mask_0 = const()[name = string("op_5139_squeeze_mask_0"), val = tensor<bool, [3]>([false, false, true])];
+            tensor<fp16, [1, 2304]> var_5139_cast_fp16 = slice_by_index(begin = var_5139_begin_0, end = var_5139_end_0, end_mask = var_5139_end_mask_0, squeeze_mask = var_5139_squeeze_mask_0, x = kernel_idxs_1_cast_fp16)[name = string("op_5139_cast_fp16")];
+            string var_5138_to_fp16_dtype_0 = const()[name = string("op_5138_to_fp16_dtype_0"), val = string("fp16")];
+            tensor<fp16, [1, 1]> floor_div_161_to_fp16 = cast(dtype = var_5138_to_fp16_dtype_0, x = floor_div_161)[name = string("cast_73")];
+            tensor<fp16, [1, 2304]> var_5140_cast_fp16 = mul(x = floor_div_161_to_fp16, y = var_5139_cast_fp16)[name = string("op_5140_cast_fp16")];
+            tensor<fp16, [1, 2304]> kernel_idxs_3_cast_fp16 = add(x = var_5137_cast_fp16, y = var_5140_cast_fp16)[name = string("kernel_idxs_3_cast_fp16")];
+            string kernel_idxs_dtype_0 = const()[name = string("kernel_idxs_dtype_0"), val = string("int32")];
+            int32 var_5143_one_hot_vector_size_0 = const()[name = string("op_5143_one_hot_vector_size_0"), val = int32(256)];
+            int32 var_5143_axis_0 = const()[name = string("op_5143_axis_0"), val = int32(-1)];
+            int32 var_5143_on_value_0 = const()[name = string("op_5143_on_value_0"), val = int32(1)];
+            int32 var_5143_off_value_0 = const()[name = string("op_5143_off_value_0"), val = int32(0)];
+            tensor<int32, [1, 2304]> kernel_idxs_3_cast_fp16_to_int32 = cast(dtype = kernel_idxs_dtype_0, x = kernel_idxs_3_cast_fp16)[name = string("cast_72")];
+            tensor<int32, [1, 2304, 256]> var_5143 = one_hot(axis = var_5143_axis_0, indices = kernel_idxs_3_cast_fp16_to_int32, off_value = var_5143_off_value_0, on_value = var_5143_on_value_0, one_hot_vector_size = var_5143_one_hot_vector_size_0)[name = string("op_5143")];
+            string var_5144_to_fp16_dtype_0 = const()[name = string("op_5144_to_fp16_dtype_0"), val = string("fp16")];
+            fp16 _inversed_weights_y_0_to_fp16 = const()[name = string("_inversed_weights_y_0_to_fp16"), val = fp16(0x1.c7p-4)];
+            tensor<fp16, [1, 2304, 256]> var_5143_to_fp16 = cast(dtype = var_5144_to_fp16_dtype_0, x = var_5143)[name = string("cast_71")];
+            tensor<fp16, [1, 2304, 256]> _inversed_weights_cast_fp16 = mul(x = var_5143_to_fp16, y = _inversed_weights_y_0_to_fp16)[name = string("_inversed_weights_cast_fp16")];
+            bool output_transpose_x_1 = const()[name = string("output_transpose_x_1"), val = bool(true)];
+            bool output_transpose_y_1 = const()[name = string("output_transpose_y_1"), val = bool(false)];
+            tensor<fp16, [1, 256, 768]> output_cast_fp16 = matmul(transpose_x = output_transpose_x_1, transpose_y = output_transpose_y_1, x = _inversed_weights_cast_fp16, y = hidden_states_933_cast_fp16)[name = string("output_cast_fp16")];
+            fp16 var_20_to_fp16 = const()[name = string("op_20_to_fp16"), val = fp16(0x1.bb8p+4)];
+            tensor<fp16, [1, 256, 768]> x_cast_fp16 = mul(x = output_cast_fp16, y = var_20_to_fp16)[name = string("x_cast_fp16")];
+            int32 var_5152 = const()[name = string("op_5152"), val = int32(-1)];
+            fp16 const_390_promoted_to_fp16 = const()[name = string("const_390_promoted_to_fp16"), val = fp16(-0x1p+0)];
+            tensor<fp16, [1, 256, 768]> var_5158_cast_fp16 = mul(x = x_cast_fp16, y = const_390_promoted_to_fp16)[name = string("op_5158_cast_fp16")];
+            bool input_259_interleave_0 = const()[name = string("input_259_interleave_0"), val = bool(false)];
+            tensor<fp16, [1, 256, 1536]> input_259_cast_fp16 = concat(axis = var_5152, interleave = input_259_interleave_0, values = (x_cast_fp16, var_5158_cast_fp16))[name = string("input_259_cast_fp16")];
+            tensor<int32, [1]> normed_axes_0 = const()[name = string("normed_axes_0"), val = tensor<int32, [1]>([-1])];
+            fp16 var_5155_to_fp16 = const()[name = string("op_5155_to_fp16"), val = fp16(0x1.5p-17)];
+            tensor<fp16, [1, 256, 1536]> normed_cast_fp16 = layer_norm(axes = normed_axes_0, epsilon = var_5155_to_fp16, x = input_259_cast_fp16)[name = string("normed_cast_fp16")];
+            tensor<int32, [2]> var_5163_split_sizes_0 = const()[name = string("op_5163_split_sizes_0"), val = tensor<int32, [2]>([768, 768])];
+            int32 var_5163_axis_0 = const()[name = string("op_5163_axis_0"), val = int32(-1)];
+            tensor<fp16, [1, 256, 768]> var_5163_cast_fp16_0, tensor<fp16, [1, 256, 768]> var_5163_cast_fp16_1 = split(axis = var_5163_axis_0, split_sizes = var_5163_split_sizes_0, x = normed_cast_fp16)[name = string("op_5163_cast_fp16")];
+            tensor<fp16, [2560, 768]> model_embed_vision_embedding_projection_weight_promoted_to_fp16 = const()[name = string("model_embed_vision_embedding_projection_weight_promoted_to_fp16"), val = tensor<fp16, [2560, 768]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(338289792)))];
+            tensor<fp16, [2560]> linear_113_bias_0_to_fp16 = const()[name = string("linear_113_bias_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = string("@model_path/weights/weight.bin"), offset = uint64(342222016)))];
+            tensor<fp16, [1, 256, 2560]> image_features = linear(bias = linear_113_bias_0_to_fp16, weight = model_embed_vision_embedding_projection_weight_promoted_to_fp16, x = var_5163_cast_fp16_0)[name = string("linear_113_cast_fp16")];
+        } -> (image_features);
+}
\ No newline at end of file
diff --git a/vision.ane.mlmodelc/weights/weight.bin b/vision.ane.mlmodelc/weights/weight.bin
new file mode 100644
index 0000000000000000000000000000000000000000..faf9e7798e89a56ed0f28eaa6dd2b735bfe3a073
--- /dev/null
+++ b/vision.ane.mlmodelc/weights/weight.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:314f7354d81899b6b9481e9431ea27f9eae163a761e2edd8a53b36ef4ad73bf5
+size 342227200