{
"model_type": "wav2vec2-a2e",
"task": "audio-to-expression",
"framework": "onnx",
"opset_version": 14,
"min_ort_version": "1.17.0",
"sample_rate": 16000,
"input_samples": 16000,
"output_fps": 30,
"num_blendshapes": 52,
"blendshape_standard": "ARKit",
"parameters": 100528020,
"upstream": {
"repo": "https://github.com/aigc3d/LAM_Audio2Expression",
"paper": "LAM: Large Avatar Model for One-Shot Animatable Gaussian Head",
"venue": "SIGGRAPH 2025",
"license": "Apache-2.0"
},
"inputs": {
"audio": {
"shape": [
"batch",
"samples"
],
"dtype": "float32",
"description": "Raw audio at 16kHz. Use 16000 samples (1s) for 30fps output."
},
"identity": {
"shape": [
"batch",
12
],
"dtype": "float32",
"description": "One-hot identity vector. 12 classes, use [1,0,...,0] for neutral."
}
},
"outputs": {
"blendshapes": {
"shape": [
"batch",
30,
52
],
"dtype": "float32",
"description": "ARKit blendshape weights at 30fps"
},
"asr_logits": {
"shape": [
"batch",
49,
32
],
"dtype": "float32",
"description": "CTC ASR logits, 49 frames per 1s input (~50fps; ~24K params auxiliary head)"
}
},
"recommended": {
"path": "model_fp16.onnx",
"format": "external_data",
"precision": "float16_surgical",
"graph_size_kb": 385,
"weights_size_mb": 192,
"conversion": "Surgical fp16: decomposed LayerNorm subgraphs kept in fp32",
"fidelity": "cosine >0.9999 vs fp32, magnitude ratio 0.998-1.002",
"backends": [
"webgpu",
"wasm"
]
},
"variants": {
"fp16_surgical": {
"path": "model_fp16.onnx",
"format": "external_data",
"size_mb": 192,
"precision": "float16",
"note": "Recommended. Decomposed LayerNorm preserved in fp32.",
"backends": [
"webgpu",
"wasm"
]
},
"fp32": {
"path": "fp32/model.onnx",
"format": "external_data",
"size_mb": 384,
"precision": "float32",
"backends": [
"webgpu",
"wasm"
]
},
"fp32_single_file": {
"path": "model.onnx",
"format": "single_file",
"size_mb": 384,
"precision": "float32",
"note": "Legacy backwards-compat. Prefer external data variants.",
"backends": [
"webgpu",
"wasm"
]
},
"fp16_naive": {
"path": "fp16/model.onnx",
"format": "external_data",
"size_mb": 192,
"precision": "float16",
"note": "Superseded by root model_fp16.onnx (surgical conversion).",
"backends": [
"webgpu",
"wasm"
]
},
"int8": {
"path": "int8/model.onnx",
"format": "external_data",
"size_mb": 97,
"precision": "int8_dynamic",
"note": "NOT RECOMMENDED. Visibly degraded output. Wav2Vec2 weights too sensitive for int8.",
"backends": [
"wasm"
]
}
},
"blendshape_names": [
"eyeBlinkLeft",
"eyeLookDownLeft",
"eyeLookInLeft",
"eyeLookOutLeft",
"eyeLookUpLeft",
"eyeSquintLeft",
"eyeWideLeft",
"eyeBlinkRight",
"eyeLookDownRight",
"eyeLookInRight",
"eyeLookOutRight",
"eyeLookUpRight",
"eyeSquintRight",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawRight",
"jawOpen",
"mouthClose",
"mouthFunnel",
"mouthPucker",
"mouthLeft",
"mouthRight",
"mouthSmileLeft",
"mouthSmileRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthPressLeft",
"mouthPressRight",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"
]
}