README.md CHANGED
@@ -36,31 +36,6 @@ Nothing contained in this Model Card should be interpreted as or deemed a restri
36
  ## Release Notes
37
  This is an update over the instruction-tuned Phi-3 Mini ONNX model release. We believe most use cases will benefit from this release, but we encourage users to test their particular AI applications. We appreciate the enthusiastic adoption of the Phi-3 model family and continue to welcome all feedback from the community.
38
 
39
- ## What’s New (2026-02)
40
-
41
- This update introduces an improved **INT4 GPU ONNX model** that incorporates **quantization-aware fine-tuning (QAT)** on top of the existing quantization pipeline.
42
-
43
- ### Benchmark Accuracy Improvements (INT4 GPU)
44
-
45
- | Benchmark Group | Representative Tasks | Avg Improvement |
46
- |----------------|------------------------------------|-----------------|
47
- | Knowledge & QA | TriviaQA, CommonSenseQA, OpenBookQA | **+3 to +10 pts** |
48
- | Reasoning | ARC-Easy, ARC-Challenge | **+0.6 to +4.2 pts** |
49
- | Commonsense | PIQA, Winogrande | **+0.5 to +1.0 pts** |
50
- | Broad Coverage | MMLU (overall) | −0.5 pts |
51
-
52
- The table above provides a high-level summary of observed accuracy deltas across benchmark categories, measured against the previous INT4 GPU ONNX model. The QAT-tuned INT4 GPU model improves accuracy on the majority of downstream reasoning and QA benchmarks. The one exception is a small regression (−0.5 pts) on the broad-coverage MMLU evaluation.
53
-
54
- ### Generation Stability (EOS Behavior)
55
-
56
- | Model | EOS Non-Emission Rate |
57
- |------|--------------------|
58
- | Torch baseline | 6% |
59
- | Previous INT4 GPU ONNX model | 52% |
60
- | Updated QAT INT4 GPU ONNX model | **11%** |
61
-
62
- The updated model reduces the EOS non-emission rate from 52% to 11% (approximately 5×) compared to the previous INT4 GPU ONNX release, as observed across a large set of randomly generated prompts. This results in more reliable sequence termination and generation behavior closer to the Torch baseline (6%).
63
-
64
  ## Hardware Supported
65
  The ONNX models are tested on:
66
 
@@ -194,7 +169,7 @@ Activation Aware Quantization (AWQ) works by identifying the top 1% most salient
194
  parinitarahi
195
 
196
  ## Contributors
197
- Sunghoon Choi, Yufeng Li, Kunal Vaishnavi, Akshay Sonawane, Rui Ren, Parinita Rahi, Nenad Banfic
198
 
199
  ## License
200
  The model is licensed under the MIT license.
 
36
  ## Release Notes
37
  This is an update over the instruction-tuned Phi-3 Mini ONNX model release. We believe most use cases will benefit from this release, but we encourage users to test their particular AI applications. We appreciate the enthusiastic adoption of the Phi-3 model family and continue to welcome all feedback from the community.
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  ## Hardware Supported
40
  The ONNX models are tested on:
41
 
 
169
  parinitarahi
170
 
171
  ## Contributors
172
+ Sunghoon Choi, Yufeng Li, Kunal Vaishnavi, Akshay Sonawane, Rui Ren, Parinita Rahi
173
 
174
  ## License
175
  The model is licensed under the MIT license.
gpu/gpu-int4-awq-block-128/chat_template.jinja DELETED
@@ -1,8 +0,0 @@
1
- {% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>
2
- ' + message['content'] + '<|end|>
3
- '}}{% elif message['role'] == 'user' %}{{'<|user|>
4
- ' + message['content'] + '<|end|>
5
- '}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
6
- ' + message['content'] + '<|end|>
7
- '}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
8
- ' }}{% else %}{{ eos_token }}{% endif %}
 
 
 
 
 
 
 
 
 
gpu/gpu-int4-awq-block-128/genai_config.json CHANGED
@@ -1,53 +1,54 @@
1
- {
2
- "model": {
3
- "bos_token_id": 1,
4
- "context_length": 131072,
5
- "decoder": {
6
- "session_options": {
7
- "log_id": "onnxruntime-genai",
8
- "provider_options": []
9
- },
10
- "filename": "model.onnx",
11
- "head_size": 96,
12
- "hidden_size": 3072,
13
- "inputs": {
14
- "input_ids": "input_ids",
15
- "attention_mask": "attention_mask",
16
- "past_key_names": "past_key_values.%d.key",
17
- "past_value_names": "past_key_values.%d.value"
18
- },
19
- "outputs": {
20
- "logits": "logits",
21
- "present_key_names": "present.%d.key",
22
- "present_value_names": "present.%d.value"
23
- },
24
- "num_attention_heads": 32,
25
- "num_hidden_layers": 32,
26
- "num_key_value_heads": 32
27
- },
28
- "eos_token_id": [
29
- 32007,
30
- 32001,
31
- 32000
32
- ],
33
- "pad_token_id": 32000,
34
- "type": "phi3",
35
- "vocab_size": 32064
36
- },
37
- "search": {
38
- "diversity_penalty": 0.0,
39
- "do_sample": false,
40
- "early_stopping": true,
41
- "length_penalty": 1.0,
42
- "max_length": 131072,
43
- "min_length": 0,
44
- "no_repeat_ngram_size": 0,
45
- "num_beams": 1,
46
- "num_return_sequences": 1,
47
- "past_present_share_buffer": true,
48
- "repetition_penalty": 1.0,
49
- "temperature": 1.0,
50
- "top_k": 50,
51
- "top_p": 1.0
52
- }
 
53
  }
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 1,
4
+ "context_length": 131072,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": []
9
+ },
10
+ "filename": "model.onnx",
11
+ "head_size": 96,
12
+ "hidden_size": 3072,
13
+ "inputs": {
14
+ "input_ids": "input_ids",
15
+ "attention_mask": "attention_mask",
16
+ "position_ids": "position_ids",
17
+ "past_key_names": "past_key_values.%d.key",
18
+ "past_value_names": "past_key_values.%d.value"
19
+ },
20
+ "outputs": {
21
+ "logits": "logits",
22
+ "present_key_names": "present.%d.key",
23
+ "present_value_names": "present.%d.value"
24
+ },
25
+ "num_attention_heads": 32,
26
+ "num_hidden_layers": 32,
27
+ "num_key_value_heads": 32
28
+ },
29
+ "eos_token_id": [
30
+ 32007,
31
+ 32001,
32
+ 32000
33
+ ],
34
+ "pad_token_id": 32000,
35
+ "type": "phi3",
36
+ "vocab_size": 32064
37
+ },
38
+ "search": {
39
+ "diversity_penalty": 0.0,
40
+ "do_sample": true,
41
+ "early_stopping": true,
42
+ "length_penalty": 1.0,
43
+ "max_length": 131072,
44
+ "min_length": 0,
45
+ "no_repeat_ngram_size": 0,
46
+ "num_beams": 1,
47
+ "num_return_sequences": 1,
48
+ "past_present_share_buffer": true,
49
+ "repetition_penalty": 1.0,
50
+ "temperature": 1.0,
51
+ "top_k": 1,
52
+ "top_p": 1.0
53
+ }
54
  }
gpu/gpu-int4-awq-block-128/model.onnx CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c022f639a6db4f0da6308e2f578fc698ac59c19a1976daee77783845c0807ee7
3
- size 26188036
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4392f76ffec63b659a83261e08337fbb33194f509816b7f843f7c46a6f37cc1
3
+ size 320891
gpu/gpu-int4-awq-block-128/model.onnx.data CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a515261766fe96490c65f4ed9ebd2b07206d77dc90e7a6422a807cce4ccc84e8
3
- size 2291335168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ccad8fba8b01a75f6ef96bd5f27401b1ba92eca512819eee3128f576453fa15
3
+ size 2303072256
gpu/gpu-int4-awq-block-128/special_tokens_map.json CHANGED
@@ -1,30 +1,30 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<|endoftext|>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- }
30
- }
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
gpu/gpu-int4-awq-block-128/tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
gpu/gpu-int4-awq-block-128/tokenizer_config.json CHANGED
@@ -1,131 +1,131 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": null,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "1": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "2": {
23
- "content": "</s>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": true,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "32000": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "32001": {
39
- "content": "<|assistant|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": true,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "32002": {
47
- "content": "<|placeholder1|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": true,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "32003": {
55
- "content": "<|placeholder2|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": true,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "32004": {
63
- "content": "<|placeholder3|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": true,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "32005": {
71
- "content": "<|placeholder4|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": true,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "32006": {
79
- "content": "<|system|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": true,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "32007": {
87
- "content": "<|end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": true,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "32008": {
95
- "content": "<|placeholder5|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": true,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "32009": {
103
- "content": "<|placeholder6|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": true,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "32010": {
111
- "content": "<|user|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": true,
115
- "single_word": false,
116
- "special": true
117
- }
118
- },
119
- "bos_token": "<s>",
120
- "clean_up_tokenization_spaces": false,
121
- "eos_token": "<|endoftext|>",
122
- "extra_special_tokens": {},
123
- "legacy": false,
124
- "model_max_length": 131072,
125
- "pad_token": "<|endoftext|>",
126
- "padding_side": "left",
127
- "sp_model_kwargs": {},
128
- "tokenizer_class": "LlamaTokenizer",
129
- "unk_token": "<unk>",
130
- "use_default_system_prompt": false
131
- }
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": true,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "32000": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<|assistant|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": true,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "32002": {
47
+ "content": "<|placeholder1|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": true,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "32003": {
55
+ "content": "<|placeholder2|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": true,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "32004": {
63
+ "content": "<|placeholder3|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": true,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "32005": {
71
+ "content": "<|placeholder4|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": true,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "32006": {
79
+ "content": "<|system|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": true,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "32007": {
87
+ "content": "<|end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": true,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "32008": {
95
+ "content": "<|placeholder5|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": true,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "32009": {
103
+ "content": "<|placeholder6|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": true,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "32010": {
111
+ "content": "<|user|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": true,
115
+ "single_word": false,
116
+ "special": true
117
+ }
118
+ },
119
+ "bos_token": "<s>",
120
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
+ "clean_up_tokenization_spaces": false,
122
+ "eos_token": "<|endoftext|>",
123
+ "legacy": false,
124
+ "model_max_length": 131072,
125
+ "pad_token": "<|endoftext|>",
126
+ "padding_side": "left",
127
+ "sp_model_kwargs": {},
128
+ "tokenizer_class": "LlamaTokenizer",
129
+ "unk_token": "<unk>",
130
+ "use_default_system_prompt": false
131
+ }