sepehrn committed on
Commit
2235ef2
·
verified ·
1 Parent(s): 252eb54

docs: update config.json with surgical fp16 as recommended, int8 warning

Browse files
Files changed (1) hide show
  1. config.json +179 -82
config.json CHANGED
@@ -1,82 +1,179 @@
1
- {
2
- "model_type": "wav2vec2-a2e",
3
- "task": "audio-to-expression",
4
- "framework": "onnx",
5
- "opset_version": 14,
6
- "min_ort_version": "1.17.0",
7
- "sample_rate": 16000,
8
- "input_samples": 16000,
9
- "output_fps": 30,
10
- "num_blendshapes": 52,
11
- "blendshape_standard": "ARKit",
12
- "parameters": 100528020,
13
- "upstream": {
14
- "repo": "https://github.com/aigc3d/LAM_Audio2Expression",
15
- "paper": "LAM: Large Avatar Model for One-Shot Animatable Gaussian Head",
16
- "venue": "SIGGRAPH 2025",
17
- "license": "Apache-2.0"
18
- },
19
- "inputs": {
20
- "audio": {
21
- "shape": ["batch", "samples"],
22
- "dtype": "float32",
23
- "description": "Raw audio at 16kHz"
24
- },
25
- "identity": {
26
- "shape": ["batch", 12],
27
- "dtype": "float32",
28
- "description": "One-hot identity vector"
29
- }
30
- },
31
- "outputs": {
32
- "blendshapes": {
33
- "shape": ["batch", "time_a2e", 52],
34
- "dtype": "float32",
35
- "description": "ARKit blendshape weights at 30fps"
36
- },
37
- "asr_logits": {
38
- "shape": ["batch", "time_asr", 32],
39
- "dtype": "float32",
40
- "description": "CTC ASR logits at 50fps"
41
- }
42
- },
43
- "variants": {
44
- "fp32": {
45
- "path": "fp32/",
46
- "size_mb": 384,
47
- "format": "external_data",
48
- "precision": "float32",
49
- "backends": ["webgpu", "wasm"]
50
- },
51
- "fp16": {
52
- "path": "fp16/",
53
- "size_mb": 192,
54
- "format": "external_data",
55
- "precision": "float16",
56
- "backends": ["webgpu", "wasm"]
57
- },
58
- "int8": {
59
- "path": "int8/",
60
- "size_mb": 97,
61
- "format": "external_data",
62
- "precision": "int8_dynamic",
63
- "backends": ["wasm"],
64
- "note": "WASM only; WebGPU has limited int8 operator support"
65
- }
66
- },
67
- "blendshape_names": [
68
- "eyeBlinkLeft", "eyeLookDownLeft", "eyeLookInLeft", "eyeLookOutLeft", "eyeLookUpLeft",
69
- "eyeSquintLeft", "eyeWideLeft", "eyeBlinkRight", "eyeLookDownRight", "eyeLookInRight",
70
- "eyeLookOutRight", "eyeLookUpRight", "eyeSquintRight", "eyeWideRight",
71
- "jawForward", "jawLeft", "jawRight", "jawOpen",
72
- "mouthClose", "mouthFunnel", "mouthPucker", "mouthLeft", "mouthRight",
73
- "mouthSmileLeft", "mouthSmileRight", "mouthFrownLeft", "mouthFrownRight",
74
- "mouthDimpleLeft", "mouthDimpleRight", "mouthStretchLeft", "mouthStretchRight",
75
- "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper",
76
- "mouthPressLeft", "mouthPressRight", "mouthLowerDownLeft", "mouthLowerDownRight",
77
- "mouthUpperUpLeft", "mouthUpperUpRight",
78
- "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight",
79
- "cheekPuff", "cheekSquintLeft", "cheekSquintRight",
80
- "noseSneerLeft", "noseSneerRight", "tongueOut"
81
- ]
82
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "wav2vec2-a2e",
3
+ "task": "audio-to-expression",
4
+ "framework": "onnx",
5
+ "opset_version": 14,
6
+ "min_ort_version": "1.17.0",
7
+ "sample_rate": 16000,
8
+ "input_samples": 16000,
9
+ "output_fps": 30,
10
+ "num_blendshapes": 52,
11
+ "blendshape_standard": "ARKit",
12
+ "parameters": 100528020,
13
+ "upstream": {
14
+ "repo": "https://github.com/aigc3d/LAM_Audio2Expression",
15
+ "paper": "LAM: Large Avatar Model for One-Shot Animatable Gaussian Head",
16
+ "venue": "SIGGRAPH 2025",
17
+ "license": "Apache-2.0"
18
+ },
19
+ "inputs": {
20
+ "audio": {
21
+ "shape": [
22
+ "batch",
23
+ "samples"
24
+ ],
25
+ "dtype": "float32",
26
+ "description": "Raw audio at 16kHz. Use 16000 samples (1s) for 30fps output."
27
+ },
28
+ "identity": {
29
+ "shape": [
30
+ "batch",
31
+ 12
32
+ ],
33
+ "dtype": "float32",
34
+ "description": "One-hot identity vector. 12 classes, use [1,0,...,0] for neutral."
35
+ }
36
+ },
37
+ "outputs": {
38
+ "blendshapes": {
39
+ "shape": [
40
+ "batch",
41
+ 30,
42
+ 52
43
+ ],
44
+ "dtype": "float32",
45
+ "description": "ARKit blendshape weights at 30fps"
46
+ },
47
+ "asr_logits": {
48
+ "shape": [
49
+ "batch",
50
+ 49,
51
+ 32
52
+ ],
53
+ "dtype": "float32",
54
+ "description": "CTC ASR logits at 50fps (~24K params auxiliary head)"
55
+ }
56
+ },
57
+ "recommended": {
58
+ "path": "model_fp16.onnx",
59
+ "format": "external_data",
60
+ "precision": "float16_surgical",
61
+ "graph_size_kb": 385,
62
+ "weights_size_mb": 192,
63
+ "conversion": "Surgical fp16: decomposed LayerNorm subgraphs kept in fp32",
64
+ "fidelity": "cosine >0.9999 vs fp32, magnitude ratio 0.998-1.002",
65
+ "backends": [
66
+ "webgpu",
67
+ "wasm"
68
+ ]
69
+ },
70
+ "variants": {
71
+ "fp16_surgical": {
72
+ "path": "model_fp16.onnx",
73
+ "format": "external_data",
74
+ "size_mb": 192,
75
+ "precision": "float16",
76
+ "note": "Recommended. Decomposed LayerNorm preserved in fp32.",
77
+ "backends": [
78
+ "webgpu",
79
+ "wasm"
80
+ ]
81
+ },
82
+ "fp32": {
83
+ "path": "fp32/model.onnx",
84
+ "format": "external_data",
85
+ "size_mb": 384,
86
+ "precision": "float32",
87
+ "backends": [
88
+ "webgpu",
89
+ "wasm"
90
+ ]
91
+ },
92
+ "fp32_single_file": {
93
+ "path": "model.onnx",
94
+ "format": "single_file",
95
+ "size_mb": 384,
96
+ "precision": "float32",
97
+ "note": "Legacy backwards-compat. Prefer external data variants.",
98
+ "backends": [
99
+ "webgpu",
100
+ "wasm"
101
+ ]
102
+ },
103
+ "fp16_naive": {
104
+ "path": "fp16/model.onnx",
105
+ "format": "external_data",
106
+ "size_mb": 192,
107
+ "precision": "float16",
108
+ "note": "Superseded by root model_fp16.onnx (surgical conversion).",
109
+ "backends": [
110
+ "webgpu",
111
+ "wasm"
112
+ ]
113
+ },
114
+ "int8": {
115
+ "path": "int8/model.onnx",
116
+ "format": "external_data",
117
+ "size_mb": 97,
118
+ "precision": "int8_dynamic",
119
+ "note": "NOT RECOMMENDED. Visibly degraded output. Wav2Vec2 weights too sensitive for int8.",
120
+ "backends": [
121
+ "wasm"
122
+ ]
123
+ }
124
+ },
125
+ "blendshape_names": [
126
+ "eyeBlinkLeft",
127
+ "eyeLookDownLeft",
128
+ "eyeLookInLeft",
129
+ "eyeLookOutLeft",
130
+ "eyeLookUpLeft",
131
+ "eyeSquintLeft",
132
+ "eyeWideLeft",
133
+ "eyeBlinkRight",
134
+ "eyeLookDownRight",
135
+ "eyeLookInRight",
136
+ "eyeLookOutRight",
137
+ "eyeLookUpRight",
138
+ "eyeSquintRight",
139
+ "eyeWideRight",
140
+ "jawForward",
141
+ "jawLeft",
142
+ "jawRight",
143
+ "jawOpen",
144
+ "mouthClose",
145
+ "mouthFunnel",
146
+ "mouthPucker",
147
+ "mouthLeft",
148
+ "mouthRight",
149
+ "mouthSmileLeft",
150
+ "mouthSmileRight",
151
+ "mouthFrownLeft",
152
+ "mouthFrownRight",
153
+ "mouthDimpleLeft",
154
+ "mouthDimpleRight",
155
+ "mouthStretchLeft",
156
+ "mouthStretchRight",
157
+ "mouthRollLower",
158
+ "mouthRollUpper",
159
+ "mouthShrugLower",
160
+ "mouthShrugUpper",
161
+ "mouthPressLeft",
162
+ "mouthPressRight",
163
+ "mouthLowerDownLeft",
164
+ "mouthLowerDownRight",
165
+ "mouthUpperUpLeft",
166
+ "mouthUpperUpRight",
167
+ "browDownLeft",
168
+ "browDownRight",
169
+ "browInnerUp",
170
+ "browOuterUpLeft",
171
+ "browOuterUpRight",
172
+ "cheekPuff",
173
+ "cheekSquintLeft",
174
+ "cheekSquintRight",
175
+ "noseSneerLeft",
176
+ "noseSneerRight",
177
+ "tongueOut"
178
+ ]
179
+ }