Phi 3.5 ONNX

by megh-2901 - opened May 22, 2025

base: refs/heads/main

←

from: refs/pr/7

Discussion Files changed

+61468

-245247

Files changed (8)

README.md +1 -26
gpu/gpu-int4-awq-block-128/chat_template.jinja +0 -8
gpu/gpu-int4-awq-block-128/genai_config.json +53 -52
gpu/gpu-int4-awq-block-128/model.onnx +2 -2
gpu/gpu-int4-awq-block-128/model.onnx.data +2 -2
gpu/gpu-int4-awq-block-128/special_tokens_map.json +30 -30
gpu/gpu-int4-awq-block-128/tokenizer.json +0 -0
gpu/gpu-int4-awq-block-128/tokenizer_config.json +131 -131

README.md CHANGED Viewed

@@ -36,31 +36,6 @@ Nothing contained in this Model Card should be interpreted as or deemed a restri
 ## Release Notes
 This is an update over the instruction-tuned Phi-3 Mini ONNX model release. We believe most use cases will benefit from this release, but we encourage users to test their particular AI applications. We appreciate the enthusiastic adoption of the Phi-3 model family and continue to welcome all feedback from the community.
-## What’s New (2026-02)
-This update introduces an improved **INT4 GPU ONNX model** that incorporates **quantization-aware fine-tuning (QAT)** on top of the existing quantization pipeline.
-### Benchmark Accuracy Improvements (INT4 GPU)
-| Benchmark Group | Representative Tasks                | Avg Improvement |
-|----------------|------------------------------------|-----------------|
-| Knowledge & QA | TriviaQA, CommonSenseQA, OpenBookQA | **+3 to +10 pts** |
-| Reasoning      | ARC-Easy, ARC-Challenge             | **+0.6 to +4.2 pts** |
-| Commonsense    | PIQA, Winogrande                    | **+0.5 to +1.0 pts** |
-| Broad Coverage | MMLU (overall)                      | −0.5 pts |
-The table above provides a high-level summary of observed accuracy deltas across benchmark categories compared to the old INT4 GPU model. The QAT-tuned INT4 GPU model improves performance on the majority of downstream reasoning and QA benchmarks, with a small regression on broad-coverage evaluation.
-### Generation Stability (EOS Behavior)
-| Model | EOS Non-Emission Rate |
-|------|--------------------|
-| Torch baseline | 6% |
-| Previous INT4 GPU ONNX model | 52% |
-| Updated QAT INT4 GPU ONNX model | **11%** |
-The updated model reduces EOS non-emission by approximately 5× compared to the previous INT4 GPU ONNX release, as observed across a large set of randomly generated prompts, resulting in more reliable sequence termination and generation behavior closer to the Torch baseline.
 ## Hardware Supported
 The ONNX models are tested on:
@@ -194,7 +169,7 @@ Activation Aware Quantization (AWQ) works by identifying the top 1% most salient
 parinitarahi
 ## Contributors
-Sunghoon Choi, Yufeng Li, Kunal Vaishnavi, Akshay Sonawane, Rui Ren, Parinita Rahi, Nenad Banfic
 ## License
 The model is licensed under the MIT license.

 ## Release Notes
 This is an update over the instruction-tuned Phi-3 Mini ONNX model release. We believe most use cases will benefit from this release, but we encourage users to test their particular AI applications. We appreciate the enthusiastic adoption of the Phi-3 model family and continue to welcome all feedback from the community.
 ## Hardware Supported
 The ONNX models are tested on:
 parinitarahi
 ## Contributors
+Sunghoon Choi, Yufeng Li, Kunal Vaishnavi, Akshay Sonawane, Rui Ren, Parinita Rahi
 ## License
 The model is licensed under the MIT license.

gpu/gpu-int4-awq-block-128/chat_template.jinja DELETED Viewed

@@ -1,8 +0,0 @@
-{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>
-' + message['content'] + '<|end|>
-'}}{% elif message['role'] == 'user' %}{{'<|user|>
-' + message['content'] + '<|end|>
-'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
-' + message['content'] + '<|end|>
-'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
-' }}{% else %}{{ eos_token }}{% endif %}

gpu/gpu-int4-awq-block-128/genai_config.json CHANGED Viewed

@@ -1,53 +1,54 @@
-{
-    "model": {
-        "bos_token_id": 1,
-        "context_length": 131072,
-        "decoder": {
-            "session_options": {
-                "log_id": "onnxruntime-genai",
-                "provider_options": []
-            },
-            "filename": "model.onnx",
-            "head_size": 96,
-            "hidden_size": 3072,
-            "inputs": {
-                "input_ids": "input_ids",
-                "attention_mask": "attention_mask",
-                "past_key_names": "past_key_values.%d.key",
-                "past_value_names": "past_key_values.%d.value"
-            },
-            "outputs": {
-                "logits": "logits",
-                "present_key_names": "present.%d.key",
-                "present_value_names": "present.%d.value"
-            },
-            "num_attention_heads": 32,
-            "num_hidden_layers": 32,
-            "num_key_value_heads": 32
-        },
-        "eos_token_id": [
-            32007,
-            32001,
-            32000
-        ],
-        "pad_token_id": 32000,
-        "type": "phi3",
-        "vocab_size": 32064
-    },
-    "search": {
-        "diversity_penalty": 0.0,
-        "do_sample": false,
-        "early_stopping": true,
-        "length_penalty": 1.0,
-        "max_length": 131072,
-        "min_length": 0,
-        "no_repeat_ngram_size": 0,
-        "num_beams": 1,
-        "num_return_sequences": 1,
-        "past_present_share_buffer": true,
-        "repetition_penalty": 1.0,
-        "temperature": 1.0,
-        "top_k": 50,
-        "top_p": 1.0
-    }
 }

+{
+    "model": {
+        "bos_token_id": 1,
+        "context_length": 131072,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "model.onnx",
+            "head_size": 96,
+            "hidden_size": 3072,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "position_ids": "position_ids",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 32,
+            "num_hidden_layers": 32,
+            "num_key_value_heads": 32
+        },
+        "eos_token_id": [
+            32007,
+            32001,
+            32000
+        ],
+        "pad_token_id": 32000,
+        "type": "phi3",
+        "vocab_size": 32064
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": true,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 131072,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 1,
+        "top_p": 1.0
+    }
 }

gpu/gpu-int4-awq-block-128/model.onnx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c022f639a6db4f0da6308e2f578fc698ac59c19a1976daee77783845c0807ee7
-size 26188036

 version https://git-lfs.github.com/spec/v1
+oid sha256:d4392f76ffec63b659a83261e08337fbb33194f509816b7f843f7c46a6f37cc1
+size 320891

gpu/gpu-int4-awq-block-128/model.onnx.data CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a515261766fe96490c65f4ed9ebd2b07206d77dc90e7a6422a807cce4ccc84e8
-size 2291335168

 version https://git-lfs.github.com/spec/v1
+oid sha256:3ccad8fba8b01a75f6ef96bd5f27401b1ba92eca512819eee3128f576453fa15
+size 2303072256

gpu/gpu-int4-awq-block-128/special_tokens_map.json CHANGED Viewed

@@ -1,30 +1,30 @@
-{
-  "bos_token": {
-    "content": "<s>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "pad_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<unk>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  }
-}

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

gpu/gpu-int4-awq-block-128/tokenizer.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

gpu/gpu-int4-awq-block-128/tokenizer_config.json CHANGED Viewed

@@ -1,131 +1,131 @@
-{
-  "add_bos_token": false,
-  "add_eos_token": false,
-  "add_prefix_space": null,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": false
-    },
-    "32000": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "32001": {
-      "content": "<|assistant|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32002": {
-      "content": "<|placeholder1|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32003": {
-      "content": "<|placeholder2|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32004": {
-      "content": "<|placeholder3|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32005": {
-      "content": "<|placeholder4|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32006": {
-      "content": "<|system|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32007": {
-      "content": "<|end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32008": {
-      "content": "<|placeholder5|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32009": {
-      "content": "<|placeholder6|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    },
-    "32010": {
-      "content": "<|user|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": true,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<s>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
-  "legacy": false,
-  "model_max_length": 131072,
-  "pad_token": "<|endoftext|>",
-  "padding_side": "left",
-  "sp_model_kwargs": {},
-  "tokenizer_class": "LlamaTokenizer",
-  "unk_token": "<unk>",
-  "use_default_system_prompt": false
-}

+{
+  "add_bos_token": false,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": false
+    },
+    "32000": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32002": {
+      "content": "<|placeholder1|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32003": {
+      "content": "<|placeholder2|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32004": {
+      "content": "<|placeholder3|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32005": {
+      "content": "<|placeholder4|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32006": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    },
+    "32010": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "legacy": false,
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}