antonios-makro committed on
Commit 18a538b · verified · 1 Parent(s): 1aba25a

Upload 4 files

Files changed (5)
  1. .gitattributes +1 -0
  2. README.md +225 -0
  3. config.json +62 -0
  4. wav2arkit_cpu.onnx +3 -0
  5. wav2arkit_cpu.onnx.data +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wav2arkit_cpu.onnx.data filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,228 @@
  ---
  license: apache-2.0
+ base_model: 3DAIGC/LAM_audio2exp
+ library_name: onnxruntime
+ pipeline_tag: audio-to-audio
+ tags:
+ - onnx
+ - audio2expression
+ - arkit
+ - blendshapes
+ - facial-animation
+ - avatar
+ - wav2vec2
+ - realtime
+ - cpu
  ---
+
+ # Wav2ARKit - Audio to Facial Expression (ONNX)
+
+ A **fused, end-to-end ONNX model** that converts raw audio waveforms directly into 52 ARKit-compatible facial blendshapes. It combines the [Facebook Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base-960h) encoder with the [LAM Audio2Expression](https://huggingface.co/3DAIGC/LAM_audio2exp) decoder, optimized for real-time CPU inference.
+
+ ## ✨ Features
+
+ | Feature | Value |
+ |---------|-------|
+ | **Input** | Raw 16kHz audio waveform |
+ | **Output** | 52 ARKit blendshapes @ 30fps |
+ | **Inference** | ~45ms per second of audio |
+ | **Speed** | ~22× faster than realtime |
+ | **Size** | 1.8 MB graph (`wav2arkit_cpu.onnx`) + 402 MB external weights (`wav2arkit_cpu.onnx.data`) |
+
+ ## Quick Start
+
+ ```python
+ import onnxruntime as ort
+ import numpy as np
+
+ # Load model
+ session = ort.InferenceSession("wav2arkit_cpu.onnx", providers=["CPUExecutionProvider"])
+
+ # Load audio (16kHz, mono, float32)
+ # Example: 1 second = 16000 samples
+ audio = np.random.randn(1, 16000).astype(np.float32)
+
+ # Run inference
+ blendshapes = session.run(None, {"audio_waveform": audio})[0]
+ # Output: (1, 30, 52) - 30 frames at 30fps, 52 blendshapes
+ ```
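+
+ The speed figures above are easy to sanity-check; a minimal sketch (absolute numbers will vary with your CPU):
+
+ ```python
+ import time
+ import numpy as np
+ import onnxruntime as ort
+
+ session = ort.InferenceSession("wav2arkit_cpu.onnx", providers=["CPUExecutionProvider"])
+ audio = np.zeros((1, 16000), dtype=np.float32)  # 1 second of silence
+
+ session.run(None, {"audio_waveform": audio})  # warm-up run
+ t0 = time.perf_counter()
+ session.run(None, {"audio_waveform": audio})
+ dt = time.perf_counter() - t0
+ print(f"{dt * 1000:.1f} ms per 1 s of audio ({1.0 / dt:.0f}x realtime)")
+ ```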
+
+ ## Model Specification
+
+ ### Input
+ | Name | Type | Shape | Description |
+ |------|------|-------|-------------|
+ | `audio_waveform` | float32 | `[batch, samples]` | Raw audio at 16kHz |
+
+ ### Output
+ | Name | Type | Shape | Description |
+ |------|------|-------|-------------|
+ | `blendshapes` | float32 | `[batch, frames, 52]` | ARKit blendshapes, values in [0, 1] |
+
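+ Both specs can also be read directly off the graph (the symbolic dimension names printed here are illustrative and may differ in the actual export):
+
+ ```python
+ import onnxruntime as ort
+
+ session = ort.InferenceSession("wav2arkit_cpu.onnx", providers=["CPUExecutionProvider"])
+ for t in session.get_inputs():
+     print("input: ", t.name, t.type, t.shape)   # audio_waveform tensor(float) [batch, samples]
+ for t in session.get_outputs():
+     print("output:", t.name, t.type, t.shape)   # blendshapes tensor(float) [batch, frames, 52]
+ ```
+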
+ ### Frame Calculation
+ ```
+ output_frames = ceil(30 × (num_samples / 16000))
+ ```
+ Example: 1 second of audio (16000 samples) → 30 frames
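+
+ A quick check of the formula in code:
+
+ ```python
+ import math
+
+ def expected_frames(num_samples: int, sample_rate: int = 16000, fps: int = 30) -> int:
+     # output_frames = ceil(30 × (num_samples / 16000))
+     return math.ceil(fps * num_samples / sample_rate)
+
+ assert expected_frames(16000) == 30   # 1 s of audio
+ assert expected_frames(8000) == 15    # 0.5 s of audio
+ assert expected_frames(16001) == 31   # a partial frame rounds up
+ ```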
+
+ ## ARKit Blendshapes
+
+ <details>
+ <summary>52 blendshape indices (click to expand)</summary>
+
+ | Idx | Name | Idx | Name |
+ |-----|------|-----|------|
+ | 0 | browDownLeft | 26 | mouthFrownRight |
+ | 1 | browDownRight | 27 | mouthFunnel |
+ | 2 | browInnerUp | 28 | mouthLeft |
+ | 3 | browOuterUpLeft | 29 | mouthLowerDownLeft |
+ | 4 | browOuterUpRight | 30 | mouthLowerDownRight |
+ | 5 | cheekPuff | 31 | mouthPressLeft |
+ | 6 | cheekSquintLeft | 32 | mouthPressRight |
+ | 7 | cheekSquintRight | 33 | mouthPucker |
+ | 8 | eyeBlinkLeft | 34 | mouthRight |
+ | 9 | eyeBlinkRight | 35 | mouthRollLower |
+ | 10 | eyeLookDownLeft | 36 | mouthRollUpper |
+ | 11 | eyeLookDownRight | 37 | mouthShrugLower |
+ | 12 | eyeLookInLeft | 38 | mouthShrugUpper |
+ | 13 | eyeLookInRight | 39 | mouthSmileLeft |
+ | 14 | eyeLookOutLeft | 40 | mouthSmileRight |
+ | 15 | eyeLookOutRight | 41 | mouthStretchLeft |
+ | 16 | eyeLookUpLeft | 42 | mouthStretchRight |
+ | 17 | eyeLookUpRight | 43 | mouthUpperUpLeft |
+ | 18 | eyeSquintLeft | 44 | mouthUpperUpRight |
+ | 19 | eyeSquintRight | 45 | noseSneerLeft |
+ | 20 | eyeWideLeft | 46 | noseSneerRight |
+ | 21 | eyeWideRight | 47 | tongueOut |
+ | 22 | jawForward | 48 | mouthClose |
+ | 23 | jawLeft | 49 | mouthDimpleLeft |
+ | 24 | jawOpen | 50 | mouthDimpleRight |
+ | 25 | mouthFrownLeft | 51 | jawRight |
+
+ </details>
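+
+ For convenience, a sketch that pairs one output frame with these names. The order is taken from the index table above; note that `config.json` lists the same 52 names in a different grouping.
+
+ ```python
+ import numpy as np
+
+ # Model output order, per the index table above.
+ ARKIT_NAMES = [
+     "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight",
+     "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight",
+     "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight",
+     "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight",
+     "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight",
+     "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight",
+     "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight",
+     "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight",
+     "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper",
+     "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight",
+     "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight",
+     "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight",
+ ]
+
+ def top_active(frame: np.ndarray, k: int = 3) -> list[tuple[str, float]]:
+     """Return the k most active blendshapes for one (52,) output frame."""
+     idx = np.argsort(frame)[::-1][:k]
+     return [(ARKIT_NAMES[i], float(frame[i])) for i in idx]
+ ```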
+
+ ## Usage Examples
+
+ ### Python with audio file
+ ```python
+ import onnxruntime as ort
+ import numpy as np
+ import soundfile as sf
+
+ session = ort.InferenceSession("wav2arkit_cpu.onnx", providers=["CPUExecutionProvider"])
+
+ # Load audio and downmix to mono first (librosa.resample expects time on the last axis)
+ audio, sr = sf.read("speech.wav")
+ if audio.ndim > 1:
+     audio = audio.mean(axis=1)
+
+ # Resample to 16kHz if needed
+ if sr != 16000:
+     import librosa
+     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+
+ # Run inference
+ audio_input = audio.astype(np.float32).reshape(1, -1)
+ blendshapes = session.run(None, {"audio_waveform": audio_input})[0]
+
+ print(f"Duration: {len(audio)/16000:.2f}s → {blendshapes.shape[1]} frames")
+ ```
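+
+ Output frames sit on a fixed 30fps grid, so frame `i` corresponds to timestamp `i / 30` seconds. Continuing the example above, a small sketch that dumps timestamped frames to JSON (the file name is arbitrary):
+
+ ```python
+ import json
+
+ fps = 30
+ records = [
+     {"t": i / fps, "weights": frame.tolist()}   # one record per ~33.3 ms frame
+     for i, frame in enumerate(blendshapes[0])
+ ]
+ with open("blendshapes.json", "w") as f:
+     json.dump(records, f)
+ ```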
+
+ ### C++
+ ```cpp
+ #include <onnxruntime_cxx_api.h>
+
+ Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "Wav2ARKit");
+ // Note: the wide-string path is for Windows; pass a narrow string on Linux/macOS.
+ Ort::Session session(env, L"wav2arkit_cpu.onnx", Ort::SessionOptions{});
+
+ std::vector<float> audio(16000); // 1 second
+ std::vector<int64_t> shape = {1, 16000};
+
+ Ort::MemoryInfo mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+ Ort::Value input = Ort::Value::CreateTensor<float>(mem, audio.data(), audio.size(), shape.data(), shape.size());
+
+ const char* input_names[] = {"audio_waveform"};
+ const char* output_names[] = {"blendshapes"};
+ auto output = session.Run({}, input_names, &input, 1, output_names, 1);
+ ```
+
+ ### JavaScript (onnxruntime-web/node)
+ ```javascript
+ const ort = require('onnxruntime-node');
+
+ const session = await ort.InferenceSession.create('wav2arkit_cpu.onnx');
+ const audioTensor = new ort.Tensor('float32', audioData, [1, audioData.length]);
+ const { blendshapes } = await session.run({ audio_waveform: audioTensor });
+ ```
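+
+ For long or live audio, one simple option is to run fixed-size chunks and concatenate the frames. A minimal sketch of that approach; note each call is independent, so small discontinuities at chunk boundaries are possible:
+
+ ```python
+ import numpy as np
+ import onnxruntime as ort
+
+ session = ort.InferenceSession("wav2arkit_cpu.onnx", providers=["CPUExecutionProvider"])
+
+ def run_chunked(audio: np.ndarray, chunk_s: float = 1.0, sr: int = 16000) -> np.ndarray:
+     """Run inference chunk by chunk; returns (1, total_frames, 52)."""
+     step = int(chunk_s * sr)
+     parts = []
+     for start in range(0, len(audio), step):
+         piece = audio[start:start + step].astype(np.float32).reshape(1, -1)
+         parts.append(session.run(None, {"audio_waveform": piece})[0])
+     return np.concatenate(parts, axis=1)
+ ```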
+
+ ## Architecture
+
+ ```
+ Audio Input  [batch, samples] @ 16kHz
+       │
+       ▼
+ Wav2Vec2 Encoder
+   ├─ CNN Feature Extractor (50fps)
+   ├─ Linear Interpolation 50fps → 30fps
+   └─ Transformer Encoder (12 layers)
+       │  [batch, frames, 768]
+       ▼
+ Feature Projection (768 → 512)
+       │  [batch, frames, 512]
+       ▼
+ Identity Encoder ◄── Identity ID (0-11): int → one-hot [12] → MLP → [64]
+   ├─ Concat: [512] + [64]              (baked as ID=11 in this export)
+   └─ SeqTranslator (3× Conv + LN + ReLU)
+       │  [batch, 512, frames]
+       ▼
+ Decoder (3× Conv1D + LayerNorm)
+       │  [batch, 512, frames]
+       ▼
+ Output Projection (512 → 52) + σ (sigmoid)
+       │
+       ▼
+ Output  [batch, frames, 52] @ 30fps, values ∈ [0, 1]
+ ```
+
+ **Note:** The identity encoder supports 12 speaker identities (0-11). This ONNX export bakes in identity `11` for single-speaker inference, so no identity input is exposed.
+
+ ## License
+
+ Apache 2.0. Based on:
+ - [3DAIGC/LAM_audio2exp](https://huggingface.co/3DAIGC/LAM_audio2exp)
+ - [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h)
config.json ADDED
@@ -0,0 +1,62 @@
+ {
+   "model_name": "wav2arkit_cpu",
+   "description": "End-to-end audio to ARKit blendshape model",
+   "format": "onnx",
+
+   "source_models": {
+     "audio_encoder": "facebook/wav2vec2-base-960h",
+     "expression_decoder": "3DAIGC/LAM_audio2exp"
+   },
+
+   "audio_encoder": {
+     "source": "facebook/wav2vec2-base-960h",
+     "hidden_size": 768
+   },
+
+   "preprocessing": {
+     "sample_rate": 16000,
+     "channels": 1,
+     "normalize": false
+   },
+
+   "input_spec": {
+     "name": "audio_waveform",
+     "dtype": "float32",
+     "shape": ["batch_size", "num_samples"]
+   },
+
+   "output_spec": {
+     "name": "blendshapes",
+     "dtype": "float32",
+     "shape": ["batch_size", "num_frames", 52],
+     "fps": 30,
+     "value_range": [0.0, 1.0]
+   },
+
+   "num_blendshapes": 52,
+   "output_fps": 30,
+   "frame_formula": "ceil(30 * num_samples / 16000)",
+
+   "blendshape_names": [
+     "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight",
+     "cheekPuff", "cheekSquintLeft", "cheekSquintRight",
+     "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight",
+     "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight",
+     "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight",
+     "eyeWideLeft", "eyeWideRight",
+     "jawForward", "jawLeft", "jawOpen", "jawRight",
+     "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight",
+     "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight",
+     "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight",
+     "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper",
+     "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight",
+     "mouthUpperUpLeft", "mouthUpperUpRight",
+     "noseSneerLeft", "noseSneerRight", "tongueOut"
+   ],
+
+   "onnx": {
+     "opset_version": 18,
+     "producer": "pytorch",
+     "model_file": "wav2arkit_cpu.onnx"
+   }
+ }
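
A minimal sketch that cross-checks the exported graph against this config (assumes both files sit in the working directory):

```python
import json
import onnxruntime as ort

with open("config.json") as f:
    cfg = json.load(f)

session = ort.InferenceSession(cfg["onnx"]["model_file"], providers=["CPUExecutionProvider"])

assert session.get_inputs()[0].name == cfg["input_spec"]["name"]    # "audio_waveform"
assert session.get_outputs()[0].name == cfg["output_spec"]["name"]  # "blendshapes"
assert len(cfg["blendshape_names"]) == cfg["num_blendshapes"]       # 52 names listed
```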
wav2arkit_cpu.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cdecbfad3915dd20b2f0718942d0b8894b2ee11edcc5a9a9da45d29a46af2ed9
+ size 1862753
wav2arkit_cpu.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0f0364673c6e50be126b193e2b56809c16ac6bee4805aea9b8251ce53429bf8
+ size 402063360