sepehrn committed on
Commit
2235ef2
·
verified ·
1 Parent(s): 252eb54

docs: update config.json with surgical fp16 as recommended, int8 warning

Browse files
Files changed (1) hide show
  1. config.json +179 -82
config.json CHANGED
@@ -1,82 +1,179 @@
1
- {
2
- "model_type": "wav2vec2-a2e",
3
- "task": "audio-to-expression",
4
- "framework": "onnx",
5
- "opset_version": 14,
6
- "min_ort_version": "1.17.0",
7
- "sample_rate": 16000,
8
- "input_samples": 16000,
9
- "output_fps": 30,
10
- "num_blendshapes": 52,
11
- "blendshape_standard": "ARKit",
12
- "parameters": 100528020,
13
- "upstream": {
14
- "repo": "https://github.com/aigc3d/LAM_Audio2Expression",
15
- "paper": "LAM: Large Avatar Model for One-Shot Animatable Gaussian Head",
16
- "venue": "SIGGRAPH 2025",
17
- "license": "Apache-2.0"
18
- },
19
- "inputs": {
20
- "audio": {
21
- "shape": ["batch", "samples"],
22
- "dtype": "float32",
23
- "description": "Raw audio at 16kHz"
24
- },
25
- "identity": {
26
- "shape": ["batch", 12],
27
- "dtype": "float32",
28
- "description": "One-hot identity vector"
29
- }
30
- },
31
- "outputs": {
32
- "blendshapes": {
33
- "shape": ["batch", "time_a2e", 52],
34
- "dtype": "float32",
35
- "description": "ARKit blendshape weights at 30fps"
36
- },
37
- "asr_logits": {
38
- "shape": ["batch", "time_asr", 32],
39
- "dtype": "float32",
40
- "description": "CTC ASR logits at 50fps"
41
- }
42
- },
43
- "variants": {
44
- "fp32": {
45
- "path": "fp32/",
46
- "size_mb": 384,
47
- "format": "external_data",
48
- "precision": "float32",
49
- "backends": ["webgpu", "wasm"]
50
- },
51
- "fp16": {
52
- "path": "fp16/",
53
- "size_mb": 192,
54
- "format": "external_data",
55
- "precision": "float16",
56
- "backends": ["webgpu", "wasm"]
57
- },
58
- "int8": {
59
- "path": "int8/",
60
- "size_mb": 97,
61
- "format": "external_data",
62
- "precision": "int8_dynamic",
63
- "backends": ["wasm"],
64
- "note": "WASM only; WebGPU has limited int8 operator support"
65
- }
66
- },
67
- "blendshape_names": [
68
- "eyeBlinkLeft", "eyeLookDownLeft", "eyeLookInLeft", "eyeLookOutLeft", "eyeLookUpLeft",
69
- "eyeSquintLeft", "eyeWideLeft", "eyeBlinkRight", "eyeLookDownRight", "eyeLookInRight",
70
- "eyeLookOutRight", "eyeLookUpRight", "eyeSquintRight", "eyeWideRight",
71
- "jawForward", "jawLeft", "jawRight", "jawOpen",
72
- "mouthClose", "mouthFunnel", "mouthPucker", "mouthLeft", "mouthRight",
73
- "mouthSmileLeft", "mouthSmileRight", "mouthFrownLeft", "mouthFrownRight",
74
- "mouthDimpleLeft", "mouthDimpleRight", "mouthStretchLeft", "mouthStretchRight",
75
- "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper",
76
- "mouthPressLeft", "mouthPressRight", "mouthLowerDownLeft", "mouthLowerDownRight",
77
- "mouthUpperUpLeft", "mouthUpperUpRight",
78
- "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight",
79
- "cheekPuff", "cheekSquintLeft", "cheekSquintRight",
80
- "noseSneerLeft", "noseSneerRight", "tongueOut"
81
- ]
82
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "wav2vec2-a2e",
3
+ "task": "audio-to-expression",
4
+ "framework": "onnx",
5
+ "opset_version": 14,
6
+ "min_ort_version": "1.17.0",
7
+ "sample_rate": 16000,
8
+ "input_samples": 16000,
9
+ "output_fps": 30,
10
+ "num_blendshapes": 52,
11
+ "blendshape_standard": "ARKit",
12
+ "parameters": 100528020,
13
+ "upstream": {
14
+ "repo": "https://github.com/aigc3d/LAM_Audio2Expression",
15
+ "paper": "LAM: Large Avatar Model for One-Shot Animatable Gaussian Head",
16
+ "venue": "SIGGRAPH 2025",
17
+ "license": "Apache-2.0"
18
+ },
19
+ "inputs": {
20
+ "audio": {
21
+ "shape": [
22
+ "batch",
23
+ "samples"
24
+ ],
25
+ "dtype": "float32",
26
+ "description": "Raw audio at 16kHz. Use 16000 samples (1s) for 30fps output."
27
+ },
28
+ "identity": {
29
+ "shape": [
30
+ "batch",
31
+ 12
32
+ ],
33
+ "dtype": "float32",
34
+ "description": "One-hot identity vector. 12 classes, use [1,0,...,0] for neutral."
35
+ }
36
+ },
37
+ "outputs": {
38
+ "blendshapes": {
39
+ "shape": [
40
+ "batch",
41
+ 30,
42
+ 52
43
+ ],
44
+ "dtype": "float32",
45
+ "description": "ARKit blendshape weights at 30fps"
46
+ },
47
+ "asr_logits": {
48
+ "shape": [
49
+ "batch",
50
+ 49,
51
+ 32
52
+ ],
53
+ "dtype": "float32",
54
+ "description": "CTC ASR logits at 50fps (~24K params auxiliary head)"
55
+ }
56
+ },
57
+ "recommended": {
58
+ "path": "model_fp16.onnx",
59
+ "format": "external_data",
60
+ "precision": "float16_surgical",
61
+ "graph_size_kb": 385,
62
+ "weights_size_mb": 192,
63
+ "conversion": "Surgical fp16: decomposed LayerNorm subgraphs kept in fp32",
64
+ "fidelity": "cosine >0.9999 vs fp32, magnitude ratio 0.998-1.002",
65
+ "backends": [
66
+ "webgpu",
67
+ "wasm"
68
+ ]
69
+ },
70
+ "variants": {
71
+ "fp16_surgical": {
72
+ "path": "model_fp16.onnx",
73
+ "format": "external_data",
74
+ "size_mb": 192,
75
+ "precision": "float16",
76
+ "note": "Recommended. Decomposed LayerNorm preserved in fp32.",
77
+ "backends": [
78
+ "webgpu",
79
+ "wasm"
80
+ ]
81
+ },
82
+ "fp32": {
83
+ "path": "fp32/model.onnx",
84
+ "format": "external_data",
85
+ "size_mb": 384,
86
+ "precision": "float32",
87
+ "backends": [
88
+ "webgpu",
89
+ "wasm"
90
+ ]
91
+ },
92
+ "fp32_single_file": {
93
+ "path": "model.onnx",
94
+ "format": "single_file",
95
+ "size_mb": 384,
96
+ "precision": "float32",
97
+ "note": "Legacy backwards-compat. Prefer external data variants.",
98
+ "backends": [
99
+ "webgpu",
100
+ "wasm"
101
+ ]
102
+ },
103
+ "fp16_naive": {
104
+ "path": "fp16/model.onnx",
105
+ "format": "external_data",
106
+ "size_mb": 192,
107
+ "precision": "float16",
108
+ "note": "Superseded by root model_fp16.onnx (surgical conversion).",
109
+ "backends": [
110
+ "webgpu",
111
+ "wasm"
112
+ ]
113
+ },
114
+ "int8": {
115
+ "path": "int8/model.onnx",
116
+ "format": "external_data",
117
+ "size_mb": 97,
118
+ "precision": "int8_dynamic",
119
+ "note": "NOT RECOMMENDED. Visibly degraded output. Wav2Vec2 weights too sensitive for int8.",
120
+ "backends": [
121
+ "wasm"
122
+ ]
123
+ }
124
+ },
125
+ "blendshape_names": [
126
+ "eyeBlinkLeft",
127
+ "eyeLookDownLeft",
128
+ "eyeLookInLeft",
129
+ "eyeLookOutLeft",
130
+ "eyeLookUpLeft",
131
+ "eyeSquintLeft",
132
+ "eyeWideLeft",
133
+ "eyeBlinkRight",
134
+ "eyeLookDownRight",
135
+ "eyeLookInRight",
136
+ "eyeLookOutRight",
137
+ "eyeLookUpRight",
138
+ "eyeSquintRight",
139
+ "eyeWideRight",
140
+ "jawForward",
141
+ "jawLeft",
142
+ "jawRight",
143
+ "jawOpen",
144
+ "mouthClose",
145
+ "mouthFunnel",
146
+ "mouthPucker",
147
+ "mouthLeft",
148
+ "mouthRight",
149
+ "mouthSmileLeft",
150
+ "mouthSmileRight",
151
+ "mouthFrownLeft",
152
+ "mouthFrownRight",
153
+ "mouthDimpleLeft",
154
+ "mouthDimpleRight",
155
+ "mouthStretchLeft",
156
+ "mouthStretchRight",
157
+ "mouthRollLower",
158
+ "mouthRollUpper",
159
+ "mouthShrugLower",
160
+ "mouthShrugUpper",
161
+ "mouthPressLeft",
162
+ "mouthPressRight",
163
+ "mouthLowerDownLeft",
164
+ "mouthLowerDownRight",
165
+ "mouthUpperUpLeft",
166
+ "mouthUpperUpRight",
167
+ "browDownLeft",
168
+ "browDownRight",
169
+ "browInnerUp",
170
+ "browOuterUpLeft",
171
+ "browOuterUpRight",
172
+ "cheekPuff",
173
+ "cheekSquintLeft",
174
+ "cheekSquintRight",
175
+ "noseSneerLeft",
176
+ "noseSneerRight",
177
+ "tongueOut"
178
+ ]
179
+ }