lwanming committed on
Commit
1123a31
·
verified ·
1 Parent(s): a250a0d

Upload 3 files

Browse files
.gitattributes CHANGED
@@ -37,3 +37,4 @@ z_image_turbo_onnx/text_encoder/q4f16/model.onnx.data filter=lfs diff=lfs merge=
37
  z_image_turbo_onnx/text_encoder/qdq-q4f16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
38
  z_image_turbo_onnx/transformer/q4f16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
39
  z_image_turbo_onnx/transformer/qdq-q4f16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
 
 
37
  z_image_turbo_onnx/text_encoder/qdq-q4f16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
38
  z_image_turbo_onnx/transformer/q4f16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
39
  z_image_turbo_onnx/transformer/qdq-q4f16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
40
+ z_image_turbo_onnx/text_encoder/q4f16-genai/model.onnx.data filter=lfs diff=lfs merge=lfs -text
z_image_turbo_onnx/text_encoder/q4f16-genai/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d52eab98a1003ab306b470def7321cbe6fa44741edb5286db2584a988c4469f0
3
+ size 690757672
z_image_turbo_onnx/text_encoder/q4f16-genai/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0893f98bd5445308dbe421fbe300f586a47ae7ef71157b3e586b67fb55ed6261
3
+ size 1526231040
z_image_turbo_onnx/text_encoder/q4f16-genai/modify_genai_model.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+
3
+ How to generate Z-image text encoder into genai-webgpu-q4f16 onnx model:
4
+ 1. Download microsoft/onnxruntime-genai
5
+ 2. install pip onnxruntime_genai
6
+ 3. cd to src/python/py/models/
7
+ 4. mkdir genai-webgpu-q4f16
8
+ 5. mkdir z-image-text-encoder
9
+ 6. Download transformer models and tokenizers from HuggingFace and move all files into z-image-text-encoder:
10
+ - https://huggingface.co/Tongyi-MAI/Z-Image-Turbo/tree/main/text_encoder
11
+ - https://huggingface.co/Tongyi-MAI/Z-Image-Turbo/tree/main/tokenizer
12
+ 7. python builder.py -i z-image-text-encoder -o genai-webgpu-q4f16 -p int4 -e webgpu --extra_options int4_block_size=32 int4_accuracy_level=4 int4_op_types_to_quantize=MatMul/Gather enable_webgpu_graph=true
13
+
14
+
15
+ Modify the genai-webgpu-q4f16 model:
16
+ 1. Remove KV cache inputs (past_key_values.*) and convert them to empty initializers
17
+ 2. Remove all outputs (logits, present.*)
18
+ 3. Add an `encoder_hidden_state` output (fp32)
19
+ 4. Dead code elimination: remove unused nodes and initializers
20
+ """
21
+
22
+ import onnx
23
+ from onnx import helper, TensorProto, numpy_helper
24
+ from onnx.external_data_helper import convert_model_to_external_data
25
+ import numpy as np
26
+ import os
27
+ import shutil
28
+
29
+ # Configuration
30
+ INPUT_MODEL_PATH = r'genai-webgpu-q4f16\model.onnx'
31
+ OUTPUT_DIR = r'genai-webgpu-q4f16-modified'
32
+ OUTPUT_MODEL_NAME = 'model.onnx'
33
+ EXTERNAL_DATA_NAME = 'model.onnx.data'
34
+
35
+ # Target output node
36
+ TARGET_OUTPUT_NAME = '/model/layers.35/input_layernorm/output_3'
37
+ CAST_OUTPUT_NAME = 'encoder_hidden_state'
38
+
39
+ # KV cache configuration (batch=1, num_heads=8, seq_len=0, head_dim=128)
40
+ KV_CACHE_SHAPE = (1, 8, 0, 128)
41
+ KV_CACHE_DTYPE = np.float16
42
+
43
+
44
+ def main():
45
+ # Get the directory where the script resides
46
+ script_dir = os.path.dirname(os.path.abspath(__file__))
47
+ input_model_path = os.path.join(script_dir, INPUT_MODEL_PATH)
48
+ output_dir = os.path.join(script_dir, OUTPUT_DIR)
49
+ output_path = os.path.join(output_dir, OUTPUT_MODEL_NAME)
50
+
51
+ print(f'Loading model: {input_model_path}')
52
+ model = onnx.load(input_model_path)
53
+
54
+ print('Modifying model...')
55
+ print(f'Original node count: {len(model.graph.node)}')
56
+ print(f'Original initializer count: {len(model.graph.initializer)}')
57
+
58
+ # 1. Handle KV cache inputs: remove the inputs and replace node references with an empty string (optional input)
59
+ kv_names = set()
60
+ for inp in model.graph.input:
61
+ if inp.name.startswith('past_key_values'):
62
+ kv_names.add(inp.name)
63
+
64
+ print(f'Converting {len(kv_names)} KV cache inputs to Optional (empty name)')
65
+
66
+ # Remove KV cache inputs
67
+ new_inputs = [inp for inp in model.graph.input if not inp.name.startswith('past_key_values')]
68
+ while len(model.graph.input) > 0:
69
+ model.graph.input.pop()
70
+ for inp in new_inputs:
71
+ model.graph.input.append(inp)
72
+
73
+ # Update node input references to ""
74
+ for node in model.graph.node:
75
+ for i, inp in enumerate(node.input):
76
+ if inp in kv_names:
77
+ node.input[i] = ""
78
+
79
+ # 2. Add a Cast node
80
+ cast_node = helper.make_node(
81
+ 'Cast',
82
+ inputs=[TARGET_OUTPUT_NAME],
83
+ outputs=[CAST_OUTPUT_NAME],
84
+ name='graph_output_cast_encoder_hidden_state',
85
+ to=TensorProto.FLOAT
86
+ )
87
+ model.graph.node.append(cast_node)
88
+
89
+ # 3. Remove all existing outputs and add the new output
90
+ while len(model.graph.output) > 0:
91
+ model.graph.output.pop()
92
+ new_output = helper.make_tensor_value_info(CAST_OUTPUT_NAME, TensorProto.FLOAT, None)
93
+ model.graph.output.append(new_output)
94
+
95
+ # 4. Dead code elimination
96
+ print('Cleaning up unused nodes...')
97
+
98
+ initializer_names = set([init.name for init in model.graph.initializer])
99
+
100
+ # Build a mapping from index to node
101
+ node_list = list(model.graph.node)
102
+ node_idx_map = {i: node for i, node in enumerate(node_list)}
103
+
104
+ # Build a mapping from output tensor name to node index
105
+ output_to_node_idx = {}
106
+ for i, node in enumerate(node_list):
107
+ for out in node.output:
108
+ output_to_node_idx[out] = i
109
+
110
+ # Use BFS to find all node indices required to produce the final outputs
111
+ outputs_needed = set([out.name for out in model.graph.output])
112
+ tensors_needed = set(outputs_needed)
113
+ node_indices_to_keep = set()
114
+
115
+ visited = set()
116
+ queue = list(outputs_needed)
117
+ while queue:
118
+ tensor = queue.pop(0)
119
+ if tensor in visited:
120
+ continue
121
+ visited.add(tensor)
122
+ tensors_needed.add(tensor)
123
+
124
+ if tensor in output_to_node_idx:
125
+ idx = output_to_node_idx[tensor]
126
+ node_indices_to_keep.add(idx)
127
+ node = node_idx_map[idx]
128
+ for inp in node.input:
129
+ if inp and inp not in visited:
130
+ queue.append(inp)
131
+
132
+ print(f'Number of nodes to keep: {len(node_indices_to_keep)}')
133
+
134
+ # Keep nodes in their original order
135
+ nodes_to_keep = [node_list[i] for i in sorted(node_indices_to_keep)]
136
+
137
+ while len(model.graph.node) > 0:
138
+ model.graph.node.pop()
139
+ for node in nodes_to_keep:
140
+ model.graph.node.append(node)
141
+
142
+ # 5. Remove unused initializers
143
+ initializers_needed = tensors_needed & initializer_names
144
+ to_remove = [init for init in model.graph.initializer if init.name not in initializers_needed]
145
+ for init in to_remove:
146
+ model.graph.initializer.remove(init)
147
+
148
+ print(f'Optimized node count: {len(model.graph.node)}')
149
+ print(f'Optimized initializer count: {len(model.graph.initializer)}')
150
+
151
+ # 6. Save the model
152
+ os.makedirs(output_dir, exist_ok=True)
153
+
154
+ # Use onnx.save_model with size_threshold=10MB to reduce external data size
155
+ onnx.save_model(
156
+ model,
157
+ output_path,
158
+ save_as_external_data=True,
159
+ all_tensors_to_one_file=True,
160
+ location=EXTERNAL_DATA_NAME,
161
+ size_threshold=1024*1024*10,
162
+ convert_attribute=False
163
+ )
164
+
165
+ print(f'\nInputs: {[inp.name for inp in model.graph.input]}')
166
+ print(f'Outputs: {[out.name for out in model.graph.output]}')
167
+ print(f'\nModel saved to: {output_path}')
168
+
169
+ # Check file sizes
170
+ model_size = os.path.getsize(output_path) / (1024 * 1024)
171
+ data_path = os.path.join(output_dir, EXTERNAL_DATA_NAME)
172
+ data_size = os.path.getsize(data_path) / (1024 * 1024)
173
+ print(f'\nFile sizes:')
174
+ print(f' {OUTPUT_MODEL_NAME}: {model_size:.2f} MB')
175
+ print(f' {EXTERNAL_DATA_NAME}: {data_size:.2f} MB')
176
+
177
+ if data_size > 2048:
178
+ print(f'\n⚠️ Warning: external data exceeds 2GB ({data_size:.2f} MB)')
179
+ else:
180
+ print(f'\n✓ external data is within the 2GB limit')
181
+
182
+ # Validate the model
183
+ print('\nValidating model...')
184
+ try:
185
+ loaded = onnx.load(output_path)
186
+ onnx.checker.check_model(loaded)
187
+ print('✓ Model validation passed!')
188
+ except Exception as e:
189
+ print(f'✗ Validation failed: {e}')
190
+ return False
191
+
192
+ return True
193
+
194
+
195
if __name__ == '__main__':
    # Script entry point. main()'s True/False result is intentionally not
    # turned into a process exit code, matching the original behavior.
    main()