{
"model_type": "wav2vec2-a2e",
"task": "audio-to-expression",
"framework": "onnx",
"opset_version": 14,
"min_ort_version": "1.17.0",
"sample_rate": 16000,
"input_samples": 16000,
"output_fps": 30,
"num_blendshapes": 52,
"blendshape_standard": "ARKit",
"parameters": 100528020,
"upstream": {
"repo": "https://github.com/aigc3d/LAM_Audio2Expression",
"paper": "LAM: Large Avatar Model for One-Shot Animatable Gaussian Head",
"venue": "SIGGRAPH 2025",
"license": "Apache-2.0"
},
"inputs": {
"audio": {
"shape": [
"batch",
"samples"
],
"dtype": "float32",
"description": "Raw audio at 16kHz. Use 16000 samples (1s) for 30fps output."
},
"identity": {
"shape": [
"batch",
12
],
"dtype": "float32",
"description": "One-hot identity vector. 12 classes, use [1,0,...,0] for neutral."
}
},
"outputs": {
"blendshapes": {
"shape": [
"batch",
30,
52
],
"dtype": "float32",
"description": "ARKit blendshape weights at 30fps"
},
"asr_logits": {
"shape": [
"batch",
49,
32
],
"dtype": "float32",
"description": "CTC ASR logits, 49 frames per 1s input (~50fps; ~24K params auxiliary head)"
}
},
"recommended": {
"path": "model_fp16.onnx",
"format": "external_data",
"precision": "float16_surgical",
"graph_size_kb": 385,
"weights_size_mb": 192,
"conversion": "Surgical fp16: decomposed LayerNorm subgraphs kept in fp32",
"fidelity": "cosine >0.9999 vs fp32, magnitude ratio 0.998-1.002",
"backends": [
"webgpu",
"wasm"
]
},
"variants": {
"fp16_surgical": {
"path": "model_fp16.onnx",
"format": "external_data",
"size_mb": 192,
"precision": "float16",
"note": "Recommended. Decomposed LayerNorm preserved in fp32.",
"backends": [
"webgpu",
"wasm"
]
},
"fp32": {
"path": "fp32/model.onnx",
"format": "external_data",
"size_mb": 384,
"precision": "float32",
"backends": [
"webgpu",
"wasm"
]
},
"fp32_single_file": {
"path": "model.onnx",
"format": "single_file",
"size_mb": 384,
"precision": "float32",
"note": "Legacy backwards-compat. Prefer external data variants.",
"backends": [
"webgpu",
"wasm"
]
},
"fp16_naive": {
"path": "fp16/model.onnx",
"format": "external_data",
"size_mb": 192,
"precision": "float16",
"note": "Superseded by root model_fp16.onnx (surgical conversion).",
"backends": [
"webgpu",
"wasm"
]
},
"int8": {
"path": "int8/model.onnx",
"format": "external_data",
"size_mb": 97,
"precision": "int8_dynamic",
"note": "NOT RECOMMENDED. Visibly degraded output. Wav2Vec2 weights too sensitive for int8.",
"backends": [
"wasm"
]
}
},
"blendshape_names": [
"eyeBlinkLeft",
"eyeLookDownLeft",
"eyeLookInLeft",
"eyeLookOutLeft",
"eyeLookUpLeft",
"eyeSquintLeft",
"eyeWideLeft",
"eyeBlinkRight",
"eyeLookDownRight",
"eyeLookInRight",
"eyeLookOutRight",
"eyeLookUpRight",
"eyeSquintRight",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawRight",
"jawOpen",
"mouthClose",
"mouthFunnel",
"mouthPucker",
"mouthLeft",
"mouthRight",
"mouthSmileLeft",
"mouthSmileRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthPressLeft",
"mouthPressRight",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"
]
}